pybel · cthoyt · Nov 6, 2019 · May 14, 2020 · May 14, 2020
diff --git a/src/pybel_tools/citation_coocurrence.py b/src/pybel_tools/citation_coocurrence.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+"""Build a network of citations connected by co-occurrence of entities."""
+
+from collections import Counter, defaultdict
+
+import click
+import itertools as itt
+import networkx as nx
+from tqdm import tqdm
+from typing import TextIO
+
+from pybel import BELGraph, Manager
+from pybel.cli import connection_option, graph_pickle_argument
+from pybel.constants import CITATION, CITATION_DB, CITATION_DB_NAME, CITATION_IDENTIFIER, CITATION_TYPE_PUBMED
+from pybel.manager.citation_utils import enrich_pubmed_citations
+
+
+@click.command()
+@connection_option
+@graph_pickle_argument
+@click.option('-o', '--output', type=click.File('w'), required=True)
+@click.option('-t', '--threshold', type=int, default=1)
+def main(connection: str, graph: BELGraph, output: TextIO, threshold):
+    """Build a citation network from the graph."""
+    enrich_pubmed_citations(Manager(connection=connection), graph)
+    citation_network = make_citation_network(graph, threshold=threshold)
+    print('Source', 'Source Title', 'Target', 'Target Title', 'Shared', sep='\t', file=output)
+    for u, v, d in citation_network.edges(data=True):
+        print(
+            u,
+            citation_network.nodes[u]['title'],
+            v,
+            citation_network.nodes[v]['title'],
+            d['weight'],
+            sep='\t',
+            file=output,
+        )
+
+
+def make_citation_network(bel_graph: BELGraph, threshold: int = 0) -> nx.Graph:
+    """Make a citation network from the BEL graph based on which statements occur in multiple sourves."""
+    dd = defaultdict(set)
+    names = {}
+    for u, v, k, d in bel_graph.edges(keys=True, data=True):
+        citation = d.get(CITATION)
+        if citation is None or citation[CITATION_DB] != CITATION_TYPE_PUBMED:
+            continue
+        reference = citation[CITATION_IDENTIFIER]
+        dd[reference].update((u, v))
+        names[reference] = citation.get(CITATION_DB_NAME)
+
+    all_nodes = set(itt.chain.from_iterable(dd.values()))
+
+    iterator = itt.product(all_nodes, itt.combinations(dd.items(), r=2))
+    iterator = tqdm(iterator, total=len(all_nodes) * (len(dd) ** 2))
+    c = Counter(
+        (c1, c2)
+        for node, ((c1, c1_values), (c2, c2_values)) in iterator
+        if node in c1_values and node in c2_values
+    )
+
+    rv = nx.Graph()
+    for (c1, c2), weight in c.items():
+        if weight >= threshold:
+            rv.add_edge(c1, c2, weight=weight)
+
+    for reference, title in names.items():
+        rv.nodes[reference]['title'] = title
+
+    return rv
+
+
+# Run the command line interface when this file is executed as a script
+if __name__ == '__main__':
+    main()
diff --git a/src/pybel_tools/curation/__init__.py b/src/pybel_tools/curation/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+"""Scripts for curation."""
diff --git a/src/pybel_tools/curation/planning/__init__.py b/src/pybel_tools/curation/planning/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+"""Scripts for planning curation."""
diff --git a/src/pybel_tools/curation/planning/check_novelties.py b/src/pybel_tools/curation/planning/check_novelties.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+
+"""This script assesses the novelty of pending curation tasks.
+
+Currently, it is limited to articles for which PMC is available to ensure
+good INDRA coverage.
+"""
+
+import json
+import logging
+from typing import Optional
+
+import click
+from easy_config.contrib.click import args_from_config
+from gitlab.v4.objects import Issue, Project
+from hbp_knowledge import get_graph
+from pybel_git.gitlab import GitlabConfig
+
+from pybel import BELGraph
+from pybel_tools.assess_completeness import CompletenessSummary, assess_completeness
+from ..recuration.utils import CURATION_LABEL
+
+# Start of the issue description line that holds the PubMed identifier; the
+# text between this prefix and the next ``]`` is read as the identifier
+_prefix = '- PMID: ['
+
+
+@click.command()
+@args_from_config(GitlabConfig)
+@click.option('-o', '--output', type=click.File('w'))
+def main(project_id: int, url: str, token: str, output) -> None:
+    """Assess the completeness of HBP curation tasks with respect to CONIB."""
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger('hbp').setLevel(logging.INFO)
+
+    gitlab_config = GitlabConfig.load(  # noqa: S106
+        project_id=project_id,
+        url=url,
+        token=token,
+    )
+    project = gitlab_config.get_project()
+    do_it(project, output)
+
+
+def do_it(project: Project, output):
+    graph = get_graph()
+
+    summaries = assess_project_completeness(project=project, graph=graph)
+
+    if output is not None:
+        json.dump(list(summaries), output, indent=2)
+    else:
+        for summary in summaries:
+            click.echo(json.dumps(summary, indent=2))
+
+
+def assess_project_completeness(*, project: Project, graph: BELGraph):
+    """Summarize thee novelty of all issues in the project."""
+    issues = project.issues.list(labels=[CURATION_LABEL])
+    for issue in issues:
+        click.echo(f'Issue {issue.id}: {issue.title}')
+        s = assess_issue_completeness(issue=issue, graph=graph)
+        d = s.summary_dict()
+        yield d
+
+
+def assess_issue_completeness(*, issue: Issue, graph: BELGraph) -> CompletenessSummary:
+    """Summarize the novelty of the PMID referenced by the issue."""
+    pmid = _get_pmid(issue.description)
+    ids = ('pmid', pmid)
+    return assess_completeness(ids, graph)
+
+
+def _get_pmid(description: str) -> Optional[str]:
+    for line in description.split('\n'):
+        line = line.strip()
+        if line.startswith(_prefix):
+            line: str = line[len(_prefix):]
+            line = line[:line.index(']')]
+            return line
+
+
+# Run the command line interface when this file is executed as a script
+if __name__ == '__main__':
+    main()
diff --git a/src/pybel_tools/curation/planning/go.py b/src/pybel_tools/curation/planning/go.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+"""Curation tools for the Gene Ontology (GO).
+
+Run with `python -m pybel_tools.curation.planning.go`.
+"""
+
+from typing import List
+
+import click
+import requests
+
+from ..utils import make_issues_from_pmids, min_year_option
+
+# Solr "select" endpoint on geneontology.io that is queried for annotations
+url = 'http://golr-aux.geneontology.io/solr/select'
+
+# Query parameters shared by all requests. The values are lists so a parameter
+# can be sent several times (e.g., multiple ``fq`` filters). The response is
+# requested as headerless, tab-separated text containing only the ``reference``
+# field, with multiple values of a field joined by ``|``.
+BASE_PARAMS = {
+    'defType': ['edismax'],
+    'qt': ['standard'],
+    'indent': ['on'],
+    'wt': ['csv'],
+    'rows': ['100000'],
+    'start': ['0'], 'fl': ['reference'],
+    'facet': ['true'],
+    'facet.mincount': ['1'],
+    'facet.sort': ['count'],
+    'json.nl': ['arrarr'],
+    'facet.limit': ['25'],
+    'hl': ['true'],
+    'hl.simple.pre': ['<em class="hilite">'],
+    'hl.snippets': ['1000'],
+    'csv.separator': ['\t'],
+    'csv.header': ['false'],
+    'csv.mv.separator': ['|'],
+    'fq': ['document_category:"annotation"'],  # add bioentity here too
+    'facet.field': ['aspect', 'taxon_subset_closure_label', 'type', 'evidence_subset_closure_label',
+                    'regulates_closure_label', 'annotation_class_label', 'qualifier',
+                    'annotation_extension_class_closure_label', 'assigned_by', 'panther_family_label'],
+    'q': ['*:*'],
+}
+
+
+def get_pmids_from_go_annotations_by_uniprot_id(uniprot_id: str) -> List[str]:
+    """Get the PubMed identifiers used in GO annotations for the given protein."""
+    params = BASE_PARAMS.copy()
+    params['fq'].append(f'bioentity:"UniProtKB:{uniprot_id}"')
+    r = requests.get(url, params)
+    lines = (
+        line.strip()
+        for line in r.text.splitlines()
+    )
+    return list(sorted({
+        line.split(':')[1]
+        for line in lines
+        if line and line.lower().startswith('pmid')
+    }))
+
+
+@click.command()
+@click.argument('uniprot_id')
+@click.option('--namespace', type=click.Choice(['uniprot']), default='uniprot')
+@min_year_option
+@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository')
+@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC')
+@click.option('-l', '--label', multiple=True)
+def main(uniprot_id: str, namespace: str, min_year: int, make_issues: bool, allow_closed: bool, label: List[str]):
+    """Get a list of documents for the given UniProt identifier.
+
+    Example: Q13148.
+    """
+    if namespace == 'uniprot':
+        pmids = get_pmids_from_go_annotations_by_uniprot_id(uniprot_id)
+    else:
+        raise ValueError(f'{namespace} is not yet supported')
+
+    make_issues_from_pmids(
+        pmids,
+        min_year=min_year,
+        allow_closed=allow_closed,
+        make_issues=make_issues,
+        labels=label,
+    )
+
+
+# Run the command line interface when this file is executed as a script
+if __name__ == '__main__':
+    main()
diff --git a/src/pybel_tools/curation/planning/pathways.py b/src/pybel_tools/curation/planning/pathways.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+
+"""Curation tools for pathway databases (WikiPathways, Reactome, and KEGG).
+
+Run with `python -m pybel_tools.curation.planning.pathways`.
+"""
+
+from typing import Iterable, Optional
+
+import click
+
+import bio2bel_kegg
+import bio2bel_reactome
+import bio2bel_wikipathways
+from compath_utils import CompathManager
+from pybel.cli import connection_option
+
+
+def get_managers(connection: Optional[str] = None) -> Iterable[CompathManager]:
+    wikipathways_manager = bio2bel_wikipathways.Manager(connection=connection)
+    if not wikipathways_manager.is_populated():
+        click.echo('WikiPathways is not populated')
+    else:
+        yield wikipathways_manager
+
+    reactome_manager = bio2bel_reactome.Manager(connection=connection)
+    if not reactome_manager.is_populated():
+        click.echo('Reactome is not populated')
+    else:
+        yield reactome_manager
+
+    kegg_manager = bio2bel_kegg.Manager(connection=connection)
+    if not kegg_manager.is_populated():
+        click.echo('KEGG is not populated')
+    else:
+        yield kegg_manager
+
+
+@click.command()
+@click.argument('name')
+@click.option('--namespace', type=click.Choice('hgnc.symbol'), default='hgnc.symbol')
+@connection_option
+def main(name: str, namespace: str, connection: Optional[str]):
+    for manager in get_managers(connection):
+        if namespace == 'hgnc.symbol':
+            protein = manager.get_protein_by_hgnc_symbol(name)
+        else:
+            raise ValueError(f'{namespace} is not yet supported')
+
+        if protein is None:
+            click.echo(f'No pathways in {manager.module_name}')
+        else:
+            for pathway in protein.pathways:
+                pathway_id = getattr(pathway, f'{manager.module_name}_id')
+                click.echo(f'{manager.module_name}:{pathway_id} ! {pathway}')
+
+
+# Run the command line interface when this file is executed as a script
+if __name__ == '__main__':
+    main()
diff --git a/src/pybel_tools/curation/planning/pubmed.py b/src/pybel_tools/curation/planning/pubmed.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+"""Curation tools for PubMed.
+
+Run with `python -m pybel_tools.curation.planning.pubmed`.
+"""
+
+import sys
+from typing import List
+
+import click
+from easy_config.contrib.click import args_from_config
+from pybel_git.gitlab import GitlabConfig
+
+from ..utils import make_issues_from_pmids, min_year_option
+
+
+@click.command()
+@args_from_config(GitlabConfig)
+@click.option('-f', '--file', default=sys.stdin, type=click.File())
+@min_year_option
+@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository')
+@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC')
+@click.option('-l', '--label', multiple=True)
+def main(project_id: int, url: str, token: str, file, min_year: int, make_issues: bool, allow_closed: bool,
+         label: List[str]):
+    """Get a list of documents by their PubMed identifiers."""
+    gitlab_config = GitlabConfig.load(  # noqa: S106
+        project_id=project_id,
+        url=url,
+        token=token,
+    )
+    project = gitlab_config.get_project()
+
+    pmids = list(sorted({
+        line.strip()
+        for line in file
+    }))
+
+    make_issues_from_pmids(
+        project,
+        pmids,
+        min_year=min_year,
+        allow_closed=allow_closed,
+        make_issues=make_issues,
+        labels=label,
+    )
+
+
+# Run the command line interface when this file is executed as a script
+if __name__ == '__main__':
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# -*- coding: utf-8 -*-

		"""Scripts for curation."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# -*- coding: utf-8 -*-

		"""Scripts for planning curation."""