Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add curation tools #142

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions src/pybel_tools/citation_coocurrence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-

"""Build a network of citations connected by co-occurrence of entities."""

from collections import Counter, defaultdict

import click
import itertools as itt
import networkx as nx
from tqdm import tqdm
from typing import TextIO

from pybel import BELGraph, Manager
from pybel.cli import connection_option, graph_pickle_argument
from pybel.constants import CITATION, CITATION_DB, CITATION_DB_NAME, CITATION_IDENTIFIER, CITATION_TYPE_PUBMED
from pybel.manager.citation_utils import enrich_pubmed_citations


@click.command()
@connection_option
@graph_pickle_argument
@click.option('-o', '--output', type=click.File('w'), required=True)
@click.option('-t', '--threshold', type=int, default=1)
def main(connection: str, graph: BELGraph, output: TextIO, threshold):
    """Build a citation network from the graph."""
    # Enrich the graph's PubMed citations before the network is derived from them
    manager = Manager(connection=connection)
    enrich_pubmed_citations(manager, graph)
    network = make_citation_network(graph, threshold=threshold)

    # The output is a tab-separated table with one row per edge of the citation network
    header = ('Source', 'Source Title', 'Target', 'Target Title', 'Shared')
    print(*header, sep='\t', file=output)

    node_data = network.nodes
    for source, target, edge_data in network.edges(data=True):
        row = (
            source,
            node_data[source]['title'],
            target,
            node_data[target]['title'],
            edge_data['weight'],
        )
        print(*row, sep='\t', file=output)


def make_citation_network(bel_graph: BELGraph, threshold: int = 0) -> nx.Graph:
    """Make a citation network from the BEL graph based on which entities occur in multiple sources.

    Each node of the result is a PubMed identifier. Two identifiers are connected when the BEL
    edges they are cited on share at least one BEL node; the ``weight`` of the connection is the
    number of BEL nodes they share.

    :param bel_graph: The BEL graph whose edge citations are inspected. Edges that have no
     citation, or whose citation is not from PubMed, are ignored.
    :param threshold: The minimum number of shared BEL nodes needed to connect two citations.
     Citations sharing no nodes are never connected, whatever the threshold.
    :return: An undirected graph over PubMed identifiers. Every node in it has a ``title``
     attribute holding the citation's name (``None`` if the citation has none).
    """
    nodes_by_reference = defaultdict(set)
    names = {}
    for u, v, _key, data in bel_graph.edges(keys=True, data=True):
        citation = data.get(CITATION)
        if citation is None or citation[CITATION_DB] != CITATION_TYPE_PUBMED:
            continue
        reference = citation[CITATION_IDENTIFIER]
        nodes_by_reference[reference].update((u, v))
        names[reference] = citation.get(CITATION_DB_NAME)

    # The number of unordered pairs is n choose 2, which is what the progress bar must count
    n_references = len(nodes_by_reference)
    pairs = tqdm(
        itt.combinations(nodes_by_reference.items(), r=2),
        total=n_references * (n_references - 1) // 2,
    )

    rv = nx.Graph()
    for (c1, c1_nodes), (c2, c2_nodes) in pairs:
        # The size of the intersection is the number of BEL nodes cited by both references
        weight = len(c1_nodes & c2_nodes)
        if weight and weight >= threshold:
            rv.add_edge(c1, c2, weight=weight)

    for reference, title in names.items():
        # References that did not get any edge are not in the network, so looking them up
        # in ``rv.nodes`` would raise a KeyError
        if reference in rv:
            rv.nodes[reference]['title'] = title

    return rv


if __name__ == '__main__':
main()
3 changes: 3 additions & 0 deletions src/pybel_tools/curation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-

"""Scripts for curation."""
3 changes: 3 additions & 0 deletions src/pybel_tools/curation/planning/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-

"""Scripts for planning curation."""
82 changes: 82 additions & 0 deletions src/pybel_tools/curation/planning/check_novelties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-

"""This script assesses the novelty of pending curation tasks.

Currently, this is limited to articles that are available in PMC to ensure
good INDRA coverage.
"""

import json
import logging
from typing import Optional

import click
from easy_config.contrib.click import args_from_config
from gitlab.v4.objects import Issue, Project
from hbp_knowledge import get_graph
from pybel_git.gitlab import GitlabConfig

from pybel import BELGraph
from pybel_tools.assess_completeness import CompletenessSummary, assess_completeness
from ..recuration.utils import CURATION_LABEL

_prefix = '- PMID: ['


@click.command()
@args_from_config(GitlabConfig)
@click.option('-o', '--output', type=click.File('w'))
def main(project_id: int, url: str, token: str, output) -> None:
    """Assess the completeness of HBP curation tasks with respect to CONIB."""
    logging.basicConfig(level=logging.INFO)
    hbp_logger = logging.getLogger('hbp')
    hbp_logger.setLevel(logging.INFO)

    # Rebuild the GitLab configuration from the command line values and resolve the project
    config_kwargs = dict(project_id=project_id, url=url, token=token)
    gitlab_config = GitlabConfig.load(**config_kwargs)  # noqa: S106
    do_it(gitlab_config.get_project(), output)


def do_it(project: Project, output):
    """Assess the curation issues of the project and report the summaries as JSON.

    The issues are assessed against the graph returned by :func:`hbp_knowledge.get_graph`.

    :param project: The GitLab project whose issues are assessed
    :param output: A writable text file that receives a single JSON list with all summaries.
     If None, each summary is echoed to the console as its own JSON document instead.
    """
    summaries = assess_project_completeness(project=project, graph=get_graph())

    if output is None:
        for summary in summaries:
            click.echo(json.dumps(summary, indent=2))
        return

    json.dump(list(summaries), output, indent=2)


def assess_project_completeness(*, project: Project, graph: BELGraph):
    """Summarize the novelty of all curation issues in the project.

    :param project: The GitLab project whose issues labeled with ``CURATION_LABEL`` are assessed
    :param graph: The BEL graph that the article of each issue is assessed against
    :return: A generator yielding one summary dictionary per issue
    """
    # python-gitlab paginates ``list()`` and only returns the first page unless ``all=True``
    # is given, which would silently drop issues beyond the first page
    issues = project.issues.list(labels=[CURATION_LABEL], all=True)
    for issue in issues:
        click.echo(f'Issue {issue.id}: {issue.title}')
        summary = assess_issue_completeness(issue=issue, graph=graph)
        yield summary.summary_dict()


def assess_issue_completeness(*, issue: Issue, graph: BELGraph) -> CompletenessSummary:
    """Summarize the novelty of the PMID referenced by the issue.

    :param issue: The GitLab issue whose description is searched for a PubMed identifier
    :param graph: The BEL graph to assess the article against
    :return: The completeness summary produced by :func:`assess_completeness`
    """
    # NOTE(review): ``_get_pmid`` is annotated as Optional, so the identifier may be None here
    return assess_completeness(('pmid', _get_pmid(issue.description)), graph)


def _get_pmid(description: Optional[str]) -> Optional[str]:
    """Extract the PubMed identifier from the description of a curation issue.

    The identifier is the text between ``_prefix`` and the next ``]`` on the first line
    (ignoring surrounding whitespace) that has both.

    :param description: The text of the issue description, which may be empty or None
    :return: The PubMed identifier, or None if the description does not contain one
    """
    if not description:
        return None

    for line in description.split('\n'):
        line = line.strip()
        if not line.startswith(_prefix):
            continue
        pmid, closing_bracket, _ = line[len(_prefix):].partition(']')
        # A line without the closing bracket is malformed, so keep looking rather than crash
        if closing_bracket:
            return pmid

    return None


if __name__ == '__main__':
main()
86 changes: 86 additions & 0 deletions src/pybel_tools/curation/planning/go.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-

"""Curation tools for the Gene Ontology (GO).

Run with `python -m pybel_tools.curation.planning.go`.
"""

from typing import List

import click
import requests

from ..utils import make_issues_from_pmids, min_year_option

# Address of the Solr "select" handler that the annotation queries are sent to
url = 'http://golr-aux.geneontology.io/solr/select'

# Query parameters shared by all requests. Every value is a list, so a key can carry
# several values (see 'facet.field'); the key order is kept as it was written.
BASE_PARAMS = {
    # Query handling
    'defType': ['edismax'],
    'qt': ['standard'],
    'indent': ['on'],
    # Response format and the slice/field of the results to return
    'wt': ['csv'],
    'rows': ['100000'],
    'start': ['0'],
    'fl': ['reference'],
    # Faceting
    'facet': ['true'],
    'facet.mincount': ['1'],
    'facet.sort': ['count'],
    'json.nl': ['arrarr'],
    'facet.limit': ['25'],
    # Highlighting
    'hl': ['true'],
    'hl.simple.pre': ['<em class="hilite">'],
    'hl.snippets': ['1000'],
    # Layout of the CSV response: tab-separated, no header, '|' between multiple values
    'csv.separator': ['\t'],
    'csv.header': ['false'],
    'csv.mv.separator': ['|'],
    'fq': ['document_category:"annotation"'],  # add bioentity here too
    'facet.field': [
        'aspect',
        'taxon_subset_closure_label',
        'type',
        'evidence_subset_closure_label',
        'regulates_closure_label',
        'annotation_class_label',
        'qualifier',
        'annotation_extension_class_closure_label',
        'assigned_by',
        'panther_family_label',
    ],
    'q': ['*:*'],
}


def get_pmids_from_go_annotations_by_uniprot_id(uniprot_id: str) -> List[str]:
    """Get the PubMed identifiers used in GO annotations for the given protein.

    :param uniprot_id: The UniProt identifier of the protein, like ``Q13148``
    :return: A sorted list of unique PubMed identifiers, without the ``PMID:`` prefix
    """
    # Build a fresh 'fq' list. ``BASE_PARAMS.copy()`` is shallow, so appending to the copy's
    # list would modify the shared one and pile up filters from earlier calls.
    params = {
        **BASE_PARAMS,
        'fq': [*BASE_PARAMS['fq'], f'bioentity:"UniProtKB:{uniprot_id}"'],
    }
    r = requests.get(url, params)
    return _parse_pmids(r.text)


def _parse_pmids(text: str) -> List[str]:
    """Collect the sorted, unique PubMed identifiers from the lines of the response text."""
    pmids = set()
    for line in text.splitlines():
        # The request sets 'csv.mv.separator' to '|', so one line may hold several references
        for reference in line.split('|'):
            reference = reference.strip()
            if reference.lower().startswith('pmid') and ':' in reference:
                pmids.add(reference.split(':')[1])
    return sorted(pmids)


@click.command()
@click.argument('uniprot_id')
@click.option('--namespace', type=click.Choice(['uniprot']), default='uniprot')
@min_year_option
@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository')
@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC')
@click.option('-l', '--label', multiple=True)
def main(uniprot_id: str, namespace: str, min_year: int, make_issues: bool, allow_closed: bool, label: List[str]):
    """Get a list of documents for the given UniProt identifier.

    Example: Q13148.
    """
    # Only UniProt identifiers can be looked up so far
    if namespace != 'uniprot':
        raise ValueError(f'{namespace} is not yet supported')

    pmids = get_pmids_from_go_annotations_by_uniprot_id(uniprot_id)

    # NOTE(review): the PubMed planning script passes a GitLab project as the first positional
    # argument of make_issues_from_pmids, while none is given here - confirm against ..utils
    issue_options = dict(
        min_year=min_year,
        allow_closed=allow_closed,
        make_issues=make_issues,
        labels=label,
    )
    make_issues_from_pmids(pmids, **issue_options)


if __name__ == '__main__':
main()
59 changes: 59 additions & 0 deletions src/pybel_tools/curation/planning/pathways.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-

"""Curation tools for pathway databases (WikiPathways, Reactome, and KEGG).

Run with `python -m pybel_tools.curation.planning.pathways`.
"""

from typing import Iterable, Optional

import click

import bio2bel_kegg
import bio2bel_reactome
import bio2bel_wikipathways
from compath_utils import CompathManager
from pybel.cli import connection_option


def get_managers(connection: Optional[str] = None) -> Iterable[CompathManager]:
    """Yield the managers of the pathway databases that are populated.

    WikiPathways, Reactome, and KEGG are checked in this order. A message is echoed for
    each database that is skipped because it is not populated.

    :param connection: The connection string handed to each manager
    """
    databases = (
        ('WikiPathways', bio2bel_wikipathways),
        ('Reactome', bio2bel_reactome),
        ('KEGG', bio2bel_kegg),
    )
    for label, module in databases:
        # Managers are only created when the generator gets to them
        manager = module.Manager(connection=connection)
        if manager.is_populated():
            yield manager
        else:
            click.echo(f'{label} is not populated')


@click.command()
@click.argument('name')
# Choice needs a sequence of choices: a bare string is read as one choice per character,
# which makes even the default value 'hgnc.symbol' invalid
@click.option('--namespace', type=click.Choice(['hgnc.symbol']), default='hgnc.symbol')
@connection_option
def main(name: str, namespace: str, connection: Optional[str]):
    """List the pathways in each populated pathway database that contain the given protein.

    Example: MAPT.
    """
    for manager in get_managers(connection):
        if namespace == 'hgnc.symbol':
            protein = manager.get_protein_by_hgnc_symbol(name)
        else:
            raise ValueError(f'{namespace} is not yet supported')

        if protein is None:
            click.echo(f'No pathways in {manager.module_name}')
            continue

        for pathway in protein.pathways:
            # The identifier attribute is named after the database, like ``kegg_id``
            pathway_id = getattr(pathway, f'{manager.module_name}_id')
            click.echo(f'{manager.module_name}:{pathway_id} ! {pathway}')


if __name__ == '__main__':
main()
51 changes: 51 additions & 0 deletions src/pybel_tools/curation/planning/pubmed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-

"""Curation tools for PubMed.

Run with `python -m pybel_tools.curation.planning.pubmed`.
"""

import sys
from typing import List

import click
from easy_config.contrib.click import args_from_config
from pybel_git.gitlab import GitlabConfig

from ..utils import make_issues_from_pmids, min_year_option


@click.command()
@args_from_config(GitlabConfig)
@click.option('-f', '--file', default=sys.stdin, type=click.File())
@min_year_option
@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository')
@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC')
@click.option('-l', '--label', multiple=True)
def main(project_id: int, url: str, token: str, file, min_year: int, make_issues: bool, allow_closed: bool,
         label: List[str]):
    """Get a list of documents by their PubMed identifiers."""
    gitlab_config = GitlabConfig.load(  # noqa: S106
        project_id=project_id,
        url=url,
        token=token,
    )
    project = gitlab_config.get_project()

    # One identifier is expected per line. Blank lines are skipped so they do not
    # end up as an empty identifier.
    stripped_lines = (line.strip() for line in file)
    pmids = sorted({
        pmid
        for pmid in stripped_lines
        if pmid
    })

    make_issues_from_pmids(
        project,
        pmids,
        min_year=min_year,
        allow_closed=allow_closed,
        make_issues=make_issues,
        labels=label,
    )


if __name__ == '__main__':
main()
Loading