Skip to content

Commit

Permalink
feature: add CV-derived dissociation methods and energies, update vendored PSI-MS CV
Browse files Browse the repository at this point in the history
Loading branch information
mobiusklein committed Sep 8, 2024
1 parent fe35bda commit cd9733c
Show file tree
Hide file tree
Showing 16 changed files with 842 additions and 314 deletions.
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ update-cv:
--location \
https://github.com/HUPO-PSI/psi-ms-CV/releases/latest/download/psi-ms.obo | gzip -c > cv/psi-ms.obo.gz

gzip -d -c cv/psi-ms.obo.gz | head -n 5

update-cv-terms:
cog -c -r -U src/meta/software.rs src/meta/instrument.rs src/meta/file_description.rs src/io/mzml/writer.rs
cog -c -r -U src/meta/software.rs src/meta/instrument.rs src/meta/file_description.rs src/io/mzml/writer.rs src/meta/activation.rs

changelog version:
#!/usr/bin/env python
Expand Down
116 changes: 116 additions & 0 deletions cv/extract_activation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import gzip
import json
import io
import itertools
import re

from typing import Tuple, Dict, Set, List

import fastobo
from fastobo.term import (
TermFrame,
IsAClause,
NameClause,
DefClause,
)

from fastobo.doc import OboDoc

from fastobo.id import PrefixedIdent

# Root of the CV subtree to extract; presumably MS:1000044 "dissociation
# method" — TODO confirm against the vendored psi-ms.obo.
ROOT_TERM = PrefixedIdent("MS", "1000044")

# Matches an underscore followed by a letter (e.g. "_a"); used with
# `format_name` to camel-case snake_case variant names.
segment_pattern = re.compile(r"(_[a-zA-Z])")


def collect_components(
    cv: OboDoc, base_term: PrefixedIdent
) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]:
    """Gather ``base_term`` and every term reachable from it via ``is_a``.

    Returns the set of matching term IDs together with a mapping from every
    term ID in the document to its frame.
    """
    id_to_clause: Dict[PrefixedIdent, TermFrame] = {}
    component_ids: Set[PrefixedIdent] = {base_term}
    frame: TermFrame
    # Two passes over the document so that children listed before their
    # parents are still picked up on the second sweep.
    for frame in itertools.chain(cv, cv):
        id_to_clause[frame.id] = frame
        is_descendant = any(
            isinstance(clause, IsAClause) and clause.term in component_ids
            for clause in frame
        )
        if is_descendant:
            component_ids.add(frame.id)
    return component_ids, id_to_clause


def format_name(match: re.Match) -> str:
    """Turn a matched ``_x`` segment into ``X`` (drop underscore, upper-case)."""
    segment = match.group(1)
    return segment[-1].upper()


def find_name(term: TermFrame):
    """Return the first ``name`` clause of *term*; raise LookupError if absent."""
    for clause in term:
        if isinstance(clause, NameClause):
            return str(clause.name)
    raise LookupError(f"Term name not found for {term.id!s}")


def make_entry_for(term: TermFrame):
    """Render *term* as one variant of a Rust ``#[term(...)]`` enum.

    The variant identifier is derived from the CV term name: punctuation is
    mapped to underscores (``+`` becomes ``plus``), then ``snake_case``
    segments are camel-cased via ``segment_pattern``/``format_name``.

    Raises:
        LookupError: if the term frame carries no ``name`` clause.
    """
    name = None
    parents = []
    descr = ""
    for clause in term:
        if isinstance(clause, NameClause):
            name = str(clause.name)
        if isinstance(clause, IsAClause):
            parents.append(str(clause.term))
        if isinstance(clause, DefClause):
            # Escape square brackets for the Rust doc attribute and swap
            # double quotes so the generated string literal stays valid.
            descr = re.sub(
                r"(\[|\])",
                lambda m: "\\\\" + m.group(1),
                str(clause.definition).replace('"', "'"),
            )

    if name is None:
        # Fail loudly instead of crashing below with a cryptic TypeError.
        raise LookupError(f"Term name not found for {term.id!s}")

    # str.replace is a no-op when the character is absent, so the previous
    # `if c in vname:` guards were redundant.
    vname = (
        name.replace("-", "_")
        .replace(":", "_")
        .replace("/", "_")
        .replace("+", "plus")
        .replace("!", "_")
    )
    vname = segment_pattern.sub(format_name, vname.replace(" ", "_"))
    vname = vname[0].upper() + vname[1:]

    # Rust identifiers may not start with a digit.
    if vname[0].isdigit():
        vname = "_" + vname

    return f"""
#[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{0}}}, parents={{{json.dumps(parents)}}})]
#[doc = "{name} - {descr}"]
{vname},"""


def generate_term_enum(terms: List[TermFrame], type_name: str):
buffer = io.StringIO()
buffer.write("pub enum $Term {".replace("$", type_name))
for term in terms:
buffer.write(make_entry_for(term))
buffer.write("\n}")
return buffer.getvalue()


def main():
    """Print the generated Rust enum for the dissociation-method subtree."""
    # Parse the vendored, gzip-compressed PSI-MS controlled vocabulary.
    cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz"))
    term_ids, id_to_clause = collect_components(cv, ROOT_TERM)
    # Derive the Rust type name from the root term's CV name.
    root_name = find_name(id_to_clause[ROOT_TERM])
    type_name = root_name.title().replace(" ", "")

    term_specs = [id_to_clause.get(tid) for tid in sorted(term_ids)]
    print(generate_term_enum(term_specs, type_name))


if __name__ == "__main__":
    main()
14 changes: 13 additions & 1 deletion cv/extract_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class ValueType(IntFlag):
"ionization-type": 'IonizationType',
"inlet-type": "InletType",
"detector-type": "DetectorType",
"collision-energy": "CollisionEnergy"
}


Expand All @@ -57,12 +58,23 @@ class ValueType(IntFlag):
"ionization-type": PrefixedIdent("MS", "1000008"),
"inlet-type": PrefixedIdent("MS", "1000007"),
"detector-type": PrefixedIdent("MS", "1000026"),
"collision-energy": PrefixedIdent("MS", "1000045"),
}


def make_parser():
    """Build the CLI parser.

    Positional ``component`` selects which CV subtree to extract; ``-``
    defers to an explicit ``--curie``/``--type-name`` pair.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the scraped diff showed both the old single-line
    # add_argument and its multi-line replacement; only one may exist or
    # argparse raises ArgumentError for the conflicting positional.
    parser.add_argument(
        "component",
        choices=[
            "mass-analyzer",
            "ionization-type",
            "inlet-type",
            "detector-type",
            "collision-energy",
            "-",
        ],
    )
    parser.add_argument("-c", "--curie")
    parser.add_argument("-t", "--type-name")
    return parser
Expand Down
130 changes: 130 additions & 0 deletions cv/extract_energy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import gzip
import json
import io
import itertools
import re

from typing import Tuple, Dict, Set, List

import fastobo
from fastobo.term import (
TermFrame,
IsAClause,
NameClause,
DefClause,
)

from fastobo.doc import OboDoc

from fastobo.id import PrefixedIdent

# Root of the CV subtree to extract; presumably MS:1000045 "collision
# energy" — TODO confirm against the vendored psi-ms.obo.
ROOT_TERM = PrefixedIdent("MS", "1000045")
# Extra subtree roots whose descendants are also harvested; merged with the
# ROOT_TERM subtree in main() via merge_term_sets.
EXTRA_ROOTS = [
    PrefixedIdent("MS", "1000138"),
    PrefixedIdent("MS", "1002680"),
    PrefixedIdent("MS", "1003410")
]

# Matches an underscore followed by a letter (e.g. "_a"); used with
# `format_name` to camel-case snake_case variant names.
segment_pattern = re.compile(r"(_[a-zA-Z])")


def collect_components(
    cv: OboDoc, base_term: PrefixedIdent
) -> Tuple[Set[PrefixedIdent], Dict[PrefixedIdent, TermFrame]]:
    """Collect ``base_term`` plus all ``is_a`` descendants from the document.

    Returns the matching term IDs and a map from every term ID seen to its
    frame.
    """
    id_to_clause: Dict[PrefixedIdent, TermFrame] = {}
    component_ids: Set[PrefixedIdent] = {base_term}
    frame: TermFrame
    # Sweep the document twice so children defined before their parents are
    # caught on the second pass.
    for frame in itertools.chain(cv, cv):
        id_to_clause[frame.id] = frame
        for clause in frame:
            if not isinstance(clause, IsAClause):
                continue
            if clause.term in component_ids:
                component_ids.add(frame.id)
    return component_ids, id_to_clause


def format_name(match: re.Match) -> str:
    """Upper-case the letter of a matched ``_x`` segment, dropping the underscore."""
    return match.group(1)[1:].upper()


def find_name(term: TermFrame):
    """Return the text of *term*'s first ``name`` clause, or raise LookupError."""
    for clause in term:
        if isinstance(clause, NameClause):
            return str(clause.name)
    raise LookupError(f"Term name not found for {term.id!s}")


def make_entry_for(term: TermFrame):
    """Render *term* as one ``f32``-carrying variant of a Rust ``#[term(...)]`` enum.

    The variant identifier is derived from the CV term name: punctuation is
    mapped to underscores (``+`` becomes ``plus``), then ``snake_case``
    segments are camel-cased via ``segment_pattern``/``format_name``.

    Raises:
        LookupError: if the term frame carries no ``name`` clause.
    """
    name = None
    parents = []
    descr = ""
    for clause in term:
        if isinstance(clause, NameClause):
            name = str(clause.name)
        if isinstance(clause, IsAClause):
            parents.append(str(clause.term))
        if isinstance(clause, DefClause):
            # Escape square brackets for the Rust doc attribute and swap
            # double quotes so the generated string literal stays valid.
            descr = re.sub(
                r"(\[|\])",
                lambda m: "\\\\" + m.group(1),
                str(clause.definition).replace('"', "'"),
            )

    if name is None:
        # Fail loudly instead of crashing below with a cryptic TypeError.
        raise LookupError(f"Term name not found for {term.id!s}")

    # str.replace is a no-op when the character is absent, so the previous
    # `if c in vname:` guards were redundant.
    vname = (
        name.replace("-", "_")
        .replace(":", "_")
        .replace("/", "_")
        .replace("+", "plus")
        .replace("!", "_")
    )
    vname = segment_pattern.sub(format_name, vname.replace(" ", "_"))
    vname = vname[0].upper() + vname[1:]

    # Rust identifiers may not start with a digit.
    if vname[0].isdigit():
        vname = "_" + vname

    return f"""
#[term(cv=MS, accession={term.id.local}, name="{name}", flags={{{0}}}, parents={{{json.dumps(parents)}}})]
#[doc = "{name} - {descr}"]
{vname}(f32),"""


def generate_term_enum(terms: List[TermFrame], type_name: str):
buffer = io.StringIO()
buffer.write("pub enum $Term {".replace("$", type_name))
for term in terms:
buffer.write(make_entry_for(term))
buffer.write("\n}")
return buffer.getvalue()


def merge_term_sets(term_sets: List[Tuple[Set, Dict]]) -> Tuple[Set, Dict]:
    """Merge several ``collect_components`` results into one ``(ids, map)`` pair.

    Unions all ID sets and merges all ID->frame mappings (later entries win
    on key collision, matching dict.update). Unlike the original, an empty
    *term_sets* returns ``(set(), {})`` instead of raising IndexError.
    Inputs are never mutated.
    """
    merged_ids: Set = set()
    merged_map: Dict = {}
    for term_ids, id_to_clause in term_sets:
        merged_ids.update(term_ids)
        merged_map.update(id_to_clause)
    return (merged_ids, merged_map)


def main():
    """Print the generated Rust enum for the dissociation-energy subtrees."""
    # Parse the vendored, gzip-compressed PSI-MS controlled vocabulary.
    cv: OboDoc = fastobo.load(gzip.open("./cv/psi-ms.obo.gz"))
    # Harvest the primary root plus each extra root, then merge the results.
    collected = [collect_components(cv, root) for root in [ROOT_TERM] + EXTRA_ROOTS]
    term_ids, id_to_clause = merge_term_sets(collected)
    # Type name is fixed rather than derived from the root term's CV name.
    type_name = "DissociationEnergy"

    term_specs = [id_to_clause.get(tid) for tid in sorted(term_ids)]
    print(generate_term_enum(term_specs, type_name))


if __name__ == "__main__":
    main()
Binary file modified cv/psi-ms.obo.gz
Binary file not shown.
Loading

0 comments on commit cd9733c

Please sign in to comment.