Skip to content

Commit

Permalink
Merge pull request #1457 from bgyori/pubmed_mesh
Browse files Browse the repository at this point in the history
Extend MeSH-PubMed and NCBI Taxonomy connection
  • Loading branch information
bgyori authored Aug 13, 2024
2 parents 19b1429 + c71e975 commit ab601b7
Show file tree
Hide file tree
Showing 5 changed files with 30,836 additions and 30,780 deletions.
12 changes: 11 additions & 1 deletion indra/databases/mesh_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
mesh_name_to_id_name = {}
mesh_id_to_tree_numbers = {}
mesh_supp_to_primary = {}
mesh_to_ncbitaxon = {}
ncbitaxon_to_mesh = {}


def _load_mesh_file(path, supplementary):
Expand All @@ -31,7 +33,15 @@ def _load_mesh_file(path, supplementary):
mesh_id, mesh_label, mesh_terms_str, mapped_to_str = terms
mesh_supp_to_primary[mesh_id] = mapped_to_str.split(',')
else:
mesh_id, mesh_label, mesh_terms_str, tree_number_str = terms
mesh_id, mesh_label, mesh_terms_str, \
tree_number_str, taxon_ids = terms
if taxon_ids:
taxon_ids = taxon_ids.split('|')
for taxon_id in taxon_ids:
# Note that these seem to be one-to-one so
# we don't need to worry about overwriting
ncbitaxon_to_mesh[taxon_id] = mesh_id
mesh_to_ncbitaxon[mesh_id] = taxon_ids
# This is a rare corner case where an entry is outside the
# tree structure, e.g., D005260, D008297
if not tree_number_str:
Expand Down
65 changes: 53 additions & 12 deletions indra/literature/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,31 @@ def get_ids_for_gene(hgnc_name, **kwargs):
return ids


def get_mesh_term_search_str(mesh_id, major_topic=False):
    """Build the PubMed search string for a single MeSH ID.

    Parameters
    ----------
    mesh_id : str
        The MeSH ID of a term to search for, e.g., D009101.
    major_topic : bool
        If True, the given MeSH ID is considered as a major topic.
        Default: False

    Returns
    -------
    str or None
        The MeSH name followed by a bracketed field tag: ``[majr]`` when
        major_topic is True, ``[nm]`` when the ID starts with 'C' and
        major_topic is False, and ``[mh]`` otherwise. None is returned
        (and an error is logged) if no MeSH name is found for the ID.
    """
    from indra.databases import mesh_client
    name = mesh_client.get_mesh_name(mesh_id)
    if not name:
        logger.error('Could not get MeSH name for ID %s' % mesh_id)
        return None
    # IDs starting with 'C' are treated as supplementary concepts, which
    # are searched with the [nm] tag unless a major topic is requested.
    is_supplementary = mesh_id.startswith('C')
    if is_supplementary and not major_topic:
        field_tag = 'nm'
    elif major_topic:
        field_tag = 'majr'
    else:
        field_tag = 'mh'
    return '%s [%s]' % (name, field_tag)


def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
"""Return PMIDs that are annotated with a given MeSH ID.
Expand All @@ -214,19 +239,35 @@ def get_ids_for_mesh(mesh_id, major_topic=False, **kwargs):
Any further PubMed search arguments that are passed to
get_ids.
"""
from indra.databases import mesh_client
mesh_name = mesh_client.get_mesh_name(mesh_id)
if not mesh_name:
logger.error('Could not get MeSH name for ID %s' % mesh_id)
search_str = get_mesh_term_search_str(mesh_id, major_topic)
ids = get_ids(search_str, use_text_word=False, **kwargs)
return ids


def get_ids_for_mesh_terms(mesh_terms, major_topics=None, **kwargs):
"""Return PMIDs that are annotated with a given list of MeSH terms.
Parameters
----------
mesh_terms : list of str
A list of MeSH IDs of terms to search for, e.g., ['D009101', 'D009102'].
major_topics : Optional[list of bool]
A list of booleans indicating whether the corresponding MeSH term
should be considered as a major topic. If None, none of the terms are
considered as major topics.
**kwargs
Any further PubMed search arguments that are passed to
get_ids.
"""
if major_topics is None:
major_topics = [False] * len(mesh_terms)
search_strs = [get_mesh_term_search_str(mesh_id, major_topic)
for mesh_id, major_topic in zip(mesh_terms, major_topics)]
search_strs = [s for s in search_strs if s is not None]
if not search_strs:
return []
suffix = 'majr' if major_topic else 'mh'
search_term = '%s [%s]' % (mesh_name, suffix)
ids = get_ids(search_term, use_text_word=False, **kwargs)
if mesh_id.startswith('C') and not major_topic:
# Get pmids for supplementary concepts as well
search_term = '%s [nm]' % mesh_name
ids2 = get_ids(search_term, use_text_word=False, **kwargs)
ids = list(set(ids) | set(ids2))
search_str = ' AND '.join([f'({s})' for s in search_strs])
ids = get_ids(search_str, use_text_word=False, **kwargs)
return ids


Expand Down
Loading

0 comments on commit ab601b7

Please sign in to comment.