Address requested changes

HUPO-PSI · Oct 22, 2024 · 0c408c3
1 parent 5ecea8d
commit 0c408c3
Show file tree

Hide file tree

Showing 14 changed files with 816 additions and 329 deletions.
diff --git a/mzspeclib/analyte.py b/mzspeclib/analyte.py
@@ -5,7 +5,7 @@
 from pyteomics import proforma
 
 from mzspeclib.attributes import AttributedEntity, IdentifiedAttributeManager, AttributeManagedProperty, AttributeProxy, AttributeGroupFacet
-from mzspeclib.const import (ANALYTE_MIXTURE_TERM, CHARGE_STATE, PROFORMA_ION, PROFORMA_SEQ, STRIPPED_PEPTIDE_SEQ, FIRST_ANALYTE_KEY, FIRST_INTERPRETATION_KEY)
+from mzspeclib.const import (ANALYTE_MIXTURE, CHARGE_STATE, PROFORMA_ION, PROFORMA_SEQ, STRIPPED_PEPTIDE_SEQ, FIRST_ANALYTE_KEY, FIRST_INTERPRETATION_KEY)
 
 
 class _AnalyteMappingProxy(Mapping[str, 'Analyte']):
@@ -132,7 +132,7 @@ def __init__(self, id, attributes: Iterable = None, analytes: Dict = None, membe
 
     def _update_mixture_members_term(self):
         value = sorted(map(int, self.analytes.keys()))
-        self.replace_attribute(ANALYTE_MIXTURE_TERM, value)
+        self.replace_attribute(ANALYTE_MIXTURE, value)
 
     def get_analyte(self, analyte_id) -> 'Analyte':
         """Retrieve an analyte by its identifier"""

diff --git a/mzspeclib/backends/base.py b/mzspeclib/backends/base.py
@@ -21,13 +21,13 @@
 from mzspeclib.attributes import Attributed, AttributedEntity, AttributeSet, AttributeManagedProperty
 from mzspeclib.ontology import _VocabularyResolverMixin
 from mzspeclib.const import (
-    FORMAT_VERSION_TERM,
-    LIBRARY_NAME_TERM,
-    LIBRARY_IDENTIFIER_TERM,
-    LIBRARY_VERSION_TERM,
-    LIBRARY_URI_TERM,
-    LIBRARY_DESCRIPTION_TERM,
-    ANALYTE_MIXTURE_TERM,
+    FORMAT_VERSION,
+    LIBRARY_NAME,
+    LIBRARY_IDENTIFIER,
+    LIBRARY_VERSION,
+    LIBRARY_URI,
+    LIBRARY_DESCRIPTION,
+    ANALYTE_MIXTURE,
     LIBRARY_SPECTRUM_INDEX,
     LIBRARY_SPECTRUM_KEY
 )
@@ -40,7 +40,7 @@
 logger = logging.getLogger(__name__.rsplit(".", 1)[0])
 logger.addHandler(logging.NullHandler())
 
-ANALYTE_MIXTURE_CURIE = ANALYTE_MIXTURE_TERM.split("|")[0]
+ANALYTE_MIXTURE_CURIE = ANALYTE_MIXTURE.split("|")[0]
 
 DEFAULT_VERSION = '1.0'
 
@@ -98,20 +98,20 @@ def type_for_format(cls, format_or_extension: str) -> Type:
 
 class _LibraryViewMixin:
 
-    name = AttributeManagedProperty[str](LIBRARY_NAME_TERM)
-    identifier = AttributeManagedProperty[str](LIBRARY_IDENTIFIER_TERM)
-    description = AttributeManagedProperty[str](LIBRARY_DESCRIPTION_TERM)
-    uri = AttributeManagedProperty[str](LIBRARY_URI_TERM)
-    library_version = AttributeManagedProperty[str](LIBRARY_VERSION_TERM)
+    name = AttributeManagedProperty[str](LIBRARY_NAME)
+    identifier = AttributeManagedProperty[str](LIBRARY_IDENTIFIER)
+    description = AttributeManagedProperty[str](LIBRARY_DESCRIPTION)
+    uri = AttributeManagedProperty[str](LIBRARY_URI)
+    library_version = AttributeManagedProperty[str](LIBRARY_VERSION)
 
     @property
     def format_version(self):
         try:
-            value = self.get_attribute(FORMAT_VERSION_TERM)
+            value = self.get_attribute(FORMAT_VERSION)
             return value
         except KeyError:
             value = DEFAULT_VERSION
-            self.add_attribute(FORMAT_VERSION_TERM, value)
+            self.add_attribute(FORMAT_VERSION, value)
             return value
 
 
@@ -281,9 +281,9 @@ def _new_cluster(self) -> SpectrumCluster:
 
     def _analyte_interpretation_link(self, spectrum: Spectrum,
                                      interpretation: Interpretation):
-        if (interpretation.has_attribute(ANALYTE_MIXTURE_TERM) and
+        if (interpretation.has_attribute(ANALYTE_MIXTURE) and
             not interpretation.analytes):
-            analyte_ids = interpretation.get_attribute(ANALYTE_MIXTURE_TERM)
+            analyte_ids = interpretation.get_attribute(ANALYTE_MIXTURE)
             if isinstance(analyte_ids, str):
                 term = self.find_term_for(ANALYTE_MIXTURE_CURIE)
                 analyte_ids = term.value_type(analyte_ids)
@@ -699,7 +699,7 @@ def _filter_attributes(self, attributes: Attributed,
     def _not_analyte_mixture_term(self, attrib):
         if attrib:
             key = attrib[0]
-            if key == ANALYTE_MIXTURE_TERM:
+            if key == ANALYTE_MIXTURE:
                 return False
         return True
 

diff --git a/mzspeclib/backends/bibliospec.py b/mzspeclib/backends/bibliospec.py
@@ -17,8 +17,23 @@
 from mzspeclib.analyte import FIRST_ANALYTE_KEY, FIRST_INTERPRETATION_KEY, Analyte
 from mzspeclib.spectrum import Spectrum, SPECTRUM_NAME, CHARGE_STATE
 from mzspeclib.attributes import AttributeManager, Attributed
-
-from mzspeclib.backends.base import SpectralLibraryBackendBase, FORMAT_VERSION_TERM, DEFAULT_VERSION
+from mzspeclib.const import (
+    CHARGE_STATE,
+    LIBRARY_IDENTIFIER,
+    PROFORMA_ION,
+    LIBRARY_NAME,
+    SCAN_NUMBER,
+    SOURCE_FILE,
+    STRIPPED_PEPTIDE_SEQ,
+    RETENTION_TIME,
+    SW_VERSION,
+    THEORETICAL_MASS,
+    NUMBER_OF_REPLICATE_SPECTRA_AVAILABLE,
+    NUMBER_OF_REPLICATE_SPECTRA_USED,
+    LIBRARY_CREATION_SW,
+)
+
+from mzspeclib.backends.base import SpectralLibraryBackendBase, FORMAT_VERSION, DEFAULT_VERSION
 
 from mzspeclib.index.base import IndexBase
 
@@ -110,16 +125,16 @@ def __init__(self, filename, **kwargs):
 
     def read_header(self) -> bool:
         attribs = AttributeManager()
-        attribs.add_attribute(FORMAT_VERSION_TERM, DEFAULT_VERSION)
-        attribs.add_attribute("MS:1003207|library creation software", "Bibliospec")
+        attribs.add_attribute(FORMAT_VERSION, DEFAULT_VERSION)
+        attribs.add_attribute(LIBRARY_CREATION_SW, "Bibliospec")
 
         info = self.connection.execute("SELECT * FROM LibInfo;").fetchone()
         library_id = info['libLSID']
         _, pfx_name = library_id.split("bibliospec:")
         _, name = pfx_name.split(":", 1)
-        attribs.add_attribute("MS:1003188|library name", name)
-        attribs.add_attribute("MS:1003187|library identifier", library_id)
-        attribs.add_attribute("MS:1003200|software version", f"{info['majorVersion']}.{info['minorVersion']}")
+        attribs.add_attribute(LIBRARY_NAME, name)
+        attribs.add_attribute(LIBRARY_IDENTIFIER, library_id)
+        attribs.add_attribute(SW_VERSION, f"{info['majorVersion']}.{info['minorVersion']}")
         self.attributes = attribs
         return True
 
@@ -131,10 +146,10 @@ def _populate_analyte(self, analyte: Analyte, row: Mapping):
         Bibliospec only stores modifications as delta masses.
         """
         peptide = self._correct_modifications_in_sequence(row)
-        analyte.add_attribute("MS:1003169|proforma peptidoform sequence", str(peptide))
-        analyte.add_attribute("MS:1001117|theoretical mass", peptide.mass)
-        analyte.add_attribute("MS:1000888|stripped peptide sequence", row['peptideSeq'])
-        analyte.add_attribute(CHARGE_STATE, row['precursorCharge'])
+        peptide.charge_state = row['precursorCharge']
+        analyte.add_attribute(PROFORMA_ION, str(peptide))
+        analyte.add_attribute(THEORETICAL_MASS, peptide.mass)
+        analyte.add_attribute(STRIPPED_PEPTIDE_SEQ, row['peptideSeq'])
 
     def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
         """
@@ -156,13 +171,13 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
         spectrum.precursor_mz = info['precursorMZ']
         spectrum.add_attribute(CHARGE_STATE, info['precursorCharge'])
         try:
-            spectrum.add_attribute("MS:1000894|retention time", info['retentionTime'])
+            spectrum.add_attribute(RETENTION_TIME, info['retentionTime'])
         except KeyError:
             pass
 
         try:
-            spectrum.add_attribute("MS:1003069|number of replicate spectra available", info['copies'])
-            spectrum.add_attribute("MS:1003070|number of replicate spectra used", 1)
+            spectrum.add_attribute(NUMBER_OF_REPLICATE_SPECTRA_AVAILABLE, info['copies'])
+            spectrum.add_attribute(NUMBER_OF_REPLICATE_SPECTRA_USED, 1)
         except KeyError:
             pass
 
@@ -171,14 +186,14 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
 
         try:
             spectrum.add_attribute(
-                "MS:1003203|constituent spectrum file",
+                SOURCE_FILE,
                 self.connection.execute("SELECT fileName FROM SpectrumSourceFiles WHERE id = ?",
                                         (info['fileID'], )).fetchone()['fileName']
             )
         except KeyError:
             pass
         spectrum.add_attribute(
-            "MS:1003057|scan number",
+            SCAN_NUMBER,
             info["SpecIDinFile"]
         )
 

diff --git a/mzspeclib/backends/diann.py b/mzspeclib/backends/diann.py
@@ -11,23 +11,26 @@
 from pyteomics import proforma
 
 from mzspeclib import annotation
-from mzspeclib.backends.base import DEFAULT_VERSION, FORMAT_VERSION_TERM, LIBRARY_NAME_TERM, _CSVSpectralLibraryBackendBase
+from mzspeclib.backends.base import DEFAULT_VERSION, FORMAT_VERSION, LIBRARY_NAME, _CSVSpectralLibraryBackendBase
 from mzspeclib.backends.utils import open_stream, urlify
 from mzspeclib.spectrum import Spectrum, SPECTRUM_NAME
+from mzspeclib.const import (
+    PROFORMA_SEQ as PROFORMA_PEPTIDE_TERM,
+    PROFORMA_ION,
+    STRIPPED_PEPTIDE_SEQ as STRIPPED_PEPTIDE_TERM,
+    SELECTED_ION_MZ as SPECTRUM_SELECTED_ION_MZ,
+    CHARGE_STATE,
+    SOURCE_FILE,
+    CUSTOM_ATTRIBUTE_NAME,
+    CUSTOM_ATTRIBUTE_VALUE
+)
 
 
 def _rewrite_unimod_peptide_as_proforma(sequence: str) -> str:
     return sequence.replace("(", '[').replace(')', ']').replace("UniMod", "UNIMOD")
 
 
-CHARGE_STATE = "MS:1000041|charge state"
-SELECTED_ION_MZ = "MS:1003053|theoretical monoisotopic m/z"
-SOURCE_FILE = "MS:1003203|constituent spectrum file"
-STRIPPED_PEPTIDE_TERM = "MS:1000888|stripped peptide sequence"
-PROFORMA_PEPTIDE_TERM = "MS:1003169|proforma peptidoform sequence"
-
-CUSTOM_ATTRIBUTE_NAME = "MS:1003275|other attribute name"
-CUSTOM_ATTRIBUTE_VALUE = "MS:1003276|other attribute value"
+THEO_SELECTED_ION_MZ = "MS:1003053|theoretical monoisotopic m/z"
 
 NO_LOSS = 'noloss'
 
@@ -79,12 +82,12 @@ def _spectrum_aggregation_type(self):
 
     def read_header(self) -> bool:
         result = super().read_header()
-        self.add_attribute(FORMAT_VERSION_TERM, DEFAULT_VERSION)
+        self.add_attribute(FORMAT_VERSION, DEFAULT_VERSION)
         if hasattr(self.filename, 'name'):
             name = self.filename.name.replace(".gz", '').rsplit('.', 1)[0].split(os.sep)[-1]
         else:
             name = self.filename.replace(".gz", '').rsplit(".", 1)[0].split(os.sep)[-1]
-        self.add_attribute(LIBRARY_NAME_TERM, name)
+        self.add_attribute(LIBRARY_NAME, name)
         self.add_attribute("MS:1003207|library creation software", "MS:1003253|DIA-NN")
         return result
 
@@ -141,7 +144,7 @@ def _parse_from_buffer(self, buffer: List[Dict[str, Any]], spectrum_index: Optio
         descr = buffer[0]
 
         spec.add_attribute(SPECTRUM_NAME, descr['transition_group_id'])
-        spec.add_attribute(SELECTED_ION_MZ, float(descr['PrecursorMz']))
+        spec.add_attribute(SPECTRUM_SELECTED_ION_MZ, float(descr['PrecursorMz']))
 
         if 'FileName' in descr:
             spec.add_attribute(SOURCE_FILE, urlify(descr['FileName']))
@@ -163,9 +166,10 @@ def _parse_from_buffer(self, buffer: List[Dict[str, Any]], spectrum_index: Optio
 
         if 'PeptideSequence' in descr:
             analyte.add_attribute(STRIPPED_PEPTIDE_TERM, descr['PeptideSequence'])
-        analyte.add_attribute(PROFORMA_PEPTIDE_TERM, pf_seq)
+        peptide.charge_state = int(descr['PrecursorCharge'])  # same int cast as the CHARGE_STATE attribute below
+        analyte.add_attribute(PROFORMA_ION, str(peptide))
         analyte.add_attribute("MS:1001117|theoretical mass", peptide.mass)
-        analyte.add_attribute(CHARGE_STATE, int(descr['PrecursorCharge']))
+        spec.add_attribute(CHARGE_STATE, int(descr['PrecursorCharge']))
 
         protein_group_id = analyte.get_next_group_identifier()
         if "UniprotID" in descr:

diff --git a/mzspeclib/backends/encyclopedia.py b/mzspeclib/backends/encyclopedia.py
@@ -18,13 +18,11 @@
 from mzspeclib.spectrum import Spectrum, SPECTRUM_NAME, CHARGE_STATE
 from mzspeclib.attributes import AttributeManager, Attributed, Attribute
 
-from mzspeclib.backends.base import SpectralLibraryBackendBase, FORMAT_VERSION_TERM, DEFAULT_VERSION
+from mzspeclib.backends.base import SpectralLibraryBackendBase, FORMAT_VERSION, DEFAULT_VERSION
 
 from mzspeclib.index.base import IndexBase
-
-
-DECOY_SPECTRUM = "MS:1003192|decoy spectrum"
-DECOY_PEPTIDE_SPECTRUM = "MS:1003195|unnatural peptidoform decoy spectrum"
+from mzspeclib import const as c
+from mzspeclib.const import CHARGE_STATE, DECOY_SPECTRUM, DECOY_PEPTIDE_SPECTRUM, PROFORMA_ION, STRIPPED_PEPTIDE_SEQ, LIBRARY_CREATION_SW
 
 
 def _decode_peaks(record: sqlite3.Row):
@@ -96,8 +94,8 @@ def __init__(self, filename: str, **kwargs):
 
     def read_header(self) -> bool:
         attribs = AttributeManager()
-        attribs.add_attribute(FORMAT_VERSION_TERM, DEFAULT_VERSION)
-        attribs.add_attribute("MS:1003207|library creation software", "EncyclopeDIA")
+        attribs.add_attribute(FORMAT_VERSION, DEFAULT_VERSION)
+        attribs.add_attribute(LIBRARY_CREATION_SW, "EncyclopeDIA")
         self.attributes = attribs
         return True
 
@@ -109,10 +107,10 @@ def _populate_analyte(self, analyte: Analyte, row: Mapping[str, Any]):
         EncyclopeDIA only stores modifications as delta masses.
         """
         peptide = proforma.ProForma.parse(row['PeptideModSeq'])
-        analyte.add_attribute("MS:1003169|proforma peptidoform sequence", str(peptide))
-        analyte.add_attribute("MS:1001117|theoretical mass", peptide.mass)
-        analyte.add_attribute("MS:1000888|stripped peptide sequence", row['PeptideSeq'])
-        analyte.add_attribute(CHARGE_STATE, row['PrecursorCharge'])
+        peptide.charge_state = row['PrecursorCharge']
+        analyte.add_attribute(PROFORMA_ION, str(peptide))
+        analyte.add_attribute(c.THEORETICAL_MASS, peptide.mass)
+        analyte.add_attribute(STRIPPED_PEPTIDE_SEQ, row['PeptideSeq'])
 
         cursor = self.connection.execute(
             "SELECT ProteinAccession, isDecoy FROM peptidetoprotein WHERE PeptideSeq = ?;", (row['PeptideSeq'], ))
@@ -146,14 +144,15 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
         spectrum.key = info['rowid']
         spectrum.index = info['rowid'] - 1
         spectrum.precursor_mz = info['PrecursorMz']
+        spectrum.add_attribute(CHARGE_STATE, info["PrecursorCharge"])
         try:
-            spectrum.add_attribute("MS:1000894|retention time", info['RTInSeconds'] / 60.0)
+            spectrum.add_attribute(c.RETENTION_TIME, info['RTInSeconds'] / 60.0)
         except KeyError:
             pass
 
         try:
             spectrum.add_attribute(
-                "MS:1003203|constituent spectrum file",
+                c.SOURCE_FILE,
                 f"file://{info['SourceFile']}"
             )
         except KeyError:
@@ -173,7 +172,7 @@ def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
 
         mz_array, intensity_array = _decode_peaks(info)
         n_peaks = len(mz_array)
-        spectrum.add_attribute("MS:1003059|number of peaks", n_peaks)
+        spectrum.add_attribute(c.NUM_PEAKS, n_peaks)
 
         peak_list = []
         # EncyclopeDIA does not encode product ion identities

diff --git a/mzspeclib/backends/json.py b/mzspeclib/backends/json.py
@@ -15,12 +15,13 @@
 from mzspeclib.analyte import Analyte, Interpretation
 from mzspeclib.spectrum import Spectrum
 from mzspeclib.utils import ValidationWarning
+from mzspeclib.const import ATTRIBUTE_SET_NAME
 
 from .base import (
     DEFAULT_VERSION,
     SpectralLibraryBackendBase,
     SpectralLibraryWriterBase,
-    FORMAT_VERSION_TERM,
+    FORMAT_VERSION,
     AttributeSetTypes,
 )
 from .utils import open_stream
@@ -50,8 +51,7 @@
 INTERPRETATION_CLASSES = "interpretation_attribute_sets"
 CLUSTER_CLASSES = "cluster_attribute_sets"
 
-FORMAT_VERSION_ACC = FORMAT_VERSION_TERM.split("|")[0]
-ATTRIBUTE_SET_NAME = "MS:1003212|library attribute set name"
+FORMAT_VERSION_ACC = FORMAT_VERSION.split("|")[0]
 
 
 class JSONSpectralLibrary(SpectralLibraryBackendBase):
@@ -108,12 +108,12 @@ def _load_attribute_sets(self, attribute_sets: dict):
     def read_header(self) -> bool:
         if self.buffer:
             self._fill_attributes(self.buffer.get(LIBRARY_METADATA_KEY), self.attributes)
-            if not self.attributes.has_attribute(FORMAT_VERSION_TERM):
+            if not self.attributes.has_attribute(FORMAT_VERSION):
                 warnings.warn(
-                    f"Library does not have a {FORMAT_VERSION_TERM}, assuming current version",
+                    f"Library does not have a {FORMAT_VERSION}, assuming current version",
                     category=ValidationWarning,
                 )
-                attributes = [Attribute(FORMAT_VERSION_TERM, DEFAULT_VERSION)] + list(self.attributes)
+                attributes = [Attribute(FORMAT_VERSION, DEFAULT_VERSION)] + list(self.attributes)
                 self.attributes.clear()
                 self.attributes._attributes_from_iterable(attributes)
             self.analyte_attribute_sets.update(self._load_attribute_sets(self.buffer.get(ANALYTE_CLASSES, {})))