From 700fb6450632ff07c6fd84d246eaa53dac5b236d Mon Sep 17 00:00:00 2001 From: Eugenia Oshurko Date: Fri, 30 Jul 2021 14:56:39 +0200 Subject: [PATCH] Hotfix/missing dependency (colab fails) (#82) * Resolved unnecessary dependency of preprocessing on downstream * Added python_louvain fix * Moved 'from_ontology' to the abstract PGFrame * Bugfix in pandas aggregate (changed behaviour for 1.3.X) --- .../backends/networkx/analyse/communities.py | 2 +- bluegraph/core/embed/embedders.py | 28 +- bluegraph/core/io.py | 189 ++-- bluegraph/core/utils.py | 21 + bluegraph/downstream/__init__.py | 2 - bluegraph/downstream/data_structures.py | 39 - bluegraph/preprocess/encoders.py | 4 +- bluegraph/preprocess/utils.py | 2 +- cord19kg/apps/visualization_app.py | 1 - ...isk facor for COVID-19 (3000 papers).ipynb | 816 +++++++++++++++++- cord19kg/utils.py | 17 +- requirements.txt | 1 + tests/downstream/test_link_prediction.py | 4 +- 13 files changed, 922 insertions(+), 204 deletions(-) diff --git a/bluegraph/backends/networkx/analyse/communities.py b/bluegraph/backends/networkx/analyse/communities.py index 23708fe6..6b3bb921 100644 --- a/bluegraph/backends/networkx/analyse/communities.py +++ b/bluegraph/backends/networkx/analyse/communities.py @@ -18,7 +18,7 @@ from operator import itemgetter from functools import partial -import community as community_louvain +import community.community_louvain as community_louvain from networkx.algorithms.community.centrality import girvan_newman from networkx.algorithms.community.label_propagation import asyn_lpa_communities from networkx.algorithms.community.quality import (performance, diff --git a/bluegraph/core/embed/embedders.py b/bluegraph/core/embed/embedders.py index 4b92d0e8..1bcd3e27 100644 --- a/bluegraph/core/embed/embedders.py +++ b/bluegraph/core/embed/embedders.py @@ -25,11 +25,29 @@ from bluegraph.exceptions import BlueGraphException, BlueGraphWarning - DEFAULT_EMBEDDING_DIMENSION = 64 -class GraphElementEmbedder(ABC): +class Embedder(ABC): + """Embedder inferface for EmbeddingPipeline.""" + + @abstractmethod + def info(self): + """Get dictionary with the info.""" + pass + + @abstractmethod + def fit_model(self, data, **kwargs): + """Train specified model on the provided data.""" + pass + + @abstractmethod + def predict_embeddings(self, data=None, **kwargs): + """Predict embeddings of out-sample elements.""" + pass + + +class GraphElementEmbedder(Embedder): """Abstract class for a node/edge embedder.""" @property @@ -80,7 +98,7 @@ def __init__(self, model_name, directed=True, include_type=False, """Initialize StellarGraphEmbedder.""" if model_name.lower() not in self._transductive_models and\ model_name.lower() not in self._inductive_models: - raise ElementEmbedder.InvalidModelException( + raise GraphElementEmbedder.InvalidModelException( f"Embedding model '{model_name.lower()}' is not implemented " f"for {self.__class__.__name__}") @@ -164,7 +182,7 @@ def predict_embeddings(self, pgframe, nodes=None): if nodes is None: nodes = pgframe.nodes() if self._embedding_model is None: - raise ElementEmbedder.PredictionException( + raise GraphElementEmbedder.PredictionException( "Embedder does not have a predictive model") input_graph = self._generate_graph(pgframe, self.graph_configs) @@ -241,6 +259,6 @@ class PredictionException(BlueGraphException): pass -class GraphEmbedder(ABC): +class GraphEmbedder(Embedder): """Abstract class for a graph embedder.""" pass diff --git a/bluegraph/core/io.py b/bluegraph/core/io.py index b8c63a31..3f4e8af0 100644 --- a/bluegraph/core/io.py +++ b/bluegraph/core/io.py @@ -463,12 +463,13 @@ def from_jsonld(self, resources, include_context=True, type_handler=None, resources : iterable of dict Collection of input resources in JSON-LD format include_context : bool, optional - Flag indicating if the context should be included as a property. Default is True. + Flag indicating if the context should be included as a property. + Default is True. type_handler : func, optional Function to apply to the value of type (e.g. '@type') types_from_relations : bool, optional - Flag indicating if resources with unkown types should be assigned with types - from the incoming relations. Default is True + Flag indicating if resources with unkown types should be assigned + with types from the incoming relations. Default is True exclude : list of str, optional Collection of property names to exclude. Default is empty. only_props : bool, optional @@ -613,6 +614,97 @@ def density(self, directed=True): return self.number_of_edges() / total_edges + @classmethod + def from_ontology(cls, filepath=None, rdf_graph=None, format="turtle", + remove_prop_uris=False): + """Create a PandasPGFrame from ontology.""" + if filepath is None and rdf_graph is None: + raise ValueError( + "Ontology source must be specified: both " + "'filepath' or 'rdf_graph' are None") + + if rdf_graph is None: + g = rdflib.Graph() + g.parse(filepath, format=format) + else: + g = rdf_graph + + classes = set() + individuals = set() + + # Extract classes and individuals as nodes + for s in g.subjects(RDF.type, OWL.Class): + if g.label(s): + classes.add(s) + for s in g.subjects(RDF.type, OWL.NamedIndividual): + if g.label(s): + individuals.add(s) + + edges = defaultdict(set) + props = defaultdict(dict) + for c in classes: + node_id = g.label(c).value + for p, o in g.predicate_objects(c): + if isinstance(o, rdflib.Literal): + prop_name = None + if g.label(p): + prop_name = g.label(p).value + else: + prop_name = str(p) + if node_id in props[prop_name]: + if isinstance(props[prop_name][node_id], list): + props[prop_name][node_id].append(o.value) + else: + props[prop_name][node_id] = [ + props[prop_name][node_id], o.value + ] + else: + props[prop_name][node_id] = o.value + else: + source_node = g.label(c).value + if p.eq(RDFS.subClassOf): + if isinstance(o, rdflib.BNode): + target_node = None + for oo in g.objects(o, OWL.someValuesFrom): + if g.label(oo): + target_node = g.label(oo).value + for oo in g.objects(o, OWL.onProperty): + if g.label(oo): + edge_label = g.label(oo).value + if target_node: + edges[(source_node, target_node)].add(edge_label) + else: + if g.label(o): + edges[(source_node, g.label(o).value)].add( + "IS_SUBCLASS_OF") + elif not p.eq(RDF.type): + target_node = g.label(o) + if target_node: + edges[(source_node, target_node.value)].add(str(p)) + + # Create a PGFrame + graph = cls( + nodes=[g.label(el).value for el in classes.union(individuals)], + edges=list(edges.keys())) + graph.add_edge_types(edges) + + # Add node props + for k, v in props.items(): + graph.add_node_properties( + pd.DataFrame(v.items(), columns=["@id", k]) + ) + + # Remove uri's from property names: 'http://...#prop' becomes 'prop' + if remove_prop_uris: + mapping = {} + for p in graph.node_properties(): + match = re.match(r"(http:\/\/.*)#(.*)", p) + if match: + mapping[p] = match.groups()[1] + graph.rename_node_properties(mapping) + + return graph + class PGFrameException(BlueGraphException): pass @@ -1239,97 +1331,6 @@ def rename_edge_properties(self, mapping): del self._edge_prop_types[k] self._edge_prop_types.update(new_typing) - @classmethod - def from_ontology(cls, filepath=None, rdf_graph=None, format="turtle", - remove_prop_uris=False): - """Create a PandasPGFrame from ontology.""" - if filepath is None and rdf_graph is None: - raise ValueError( - "Ontology source must be specified: both " - "'filepath' or 'rdf_graph' are None") - - if rdf_graph is None: - g = rdflib.Graph() - g.parse(filepath, format=format) - else: - g = rdf_graph - - classes = set() - individuals = set() - - # Extract classes and individuals as nodes - for s in g.subjects(RDF.type, OWL.Class): - if g.label(s): - classes.add(s) - for s in g.subjects(RDF.type, OWL.NamedIndividual): - if g.label(s): - individuals.add(s) - - edges = defaultdict(set) - props = defaultdict(dict) - for c in classes: - node_id = g.label(c).value - for p, o in g.predicate_objects(c): - if isinstance(o, rdflib.Literal): - prop_name = None - if g.label(p): - prop_name = g.label(p).value - else: - prop_name = str(p) - if node_id in props[prop_name]: - if isinstance(props[prop_name][node_id], list): - props[prop_name][node_id].append(o.value) - else: - props[prop_name][node_id] = [ - props[prop_name][node_id], o.value - ] - else: - props[prop_name][node_id] = o.value - else: - source_node = g.label(c).value - if p.eq(RDFS.subClassOf): - if isinstance(o, rdflib.BNode): - target_node = None - for oo in g.objects(o, OWL.someValuesFrom): - if g.label(oo): - target_node = g.label(oo).value - for oo in g.objects(o, OWL.onProperty): - if g.label(oo): - edge_label = g.label(oo).value - if target_node: - edges[(source_node, target_node)].add(edge_label) - else: - if g.label(o): - edges[(source_node, g.label(o).value)].add( - "IS_SUBCLASS_OF") - elif not p.eq(RDF.type): - target_node = g.label(o) - if target_node: - edges[(source_node, target_node.value)].add(str(p)) - - # Create a PGFrame - graph = cls( - nodes=[g.label(el).value for el in classes.union(individuals)], - edges=list(edges.keys())) - graph.add_edge_types(edges) - - # Add node props - for k, v in props.items(): - graph.add_node_properties( - pd.DataFrame(v.items(), columns=["@id", k]) - ) - - # Remove uri's from property names: 'http://...#prop' becomes 'prop' - if remove_prop_uris: - mapping = {} - for p in graph.node_properties(): - match = re.match(r"(http:\/\/.*)#(.*)", p) - if match: - mapping[p] = match.groups()[1] - graph.rename_node_properties(mapping) - - return graph - class SparkPGFrame(PGFrame): """Class for storing typed PGs as a collection of Spark DataFrames.""" diff --git a/bluegraph/core/utils.py b/bluegraph/core/utils.py index 8000128f..8517c5a8 100644 --- a/bluegraph/core/utils.py +++ b/bluegraph/core/utils.py @@ -13,6 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from abc import ABC, abstractmethod + import math import pandas as pd @@ -74,3 +76,22 @@ def top_n(data_dict, n, smallest=False): else: df = df.nlargest(n, columns=["value"]) return(list(df["id"])) + + +class Preprocessor(ABC): + """Preprocessor inferface for EmbeddingPipeline.""" + + @abstractmethod + def info(self): + """Get dictionary with the info.""" + pass + + @abstractmethod + def fit(self, data, **kwargs): + """Fit the preprocessor.""" + pass + + @abstractmethod + def transform(self, data, **kwargs): + """Tranform the input data.""" + pass diff --git a/bluegraph/downstream/__init__.py b/bluegraph/downstream/__init__.py index 8bb0307e..98675468 100644 --- a/bluegraph/downstream/__init__.py +++ b/bluegraph/downstream/__init__.py @@ -14,7 +14,5 @@ # See the License for the specific language governing permissions and # limitations under the License. from .data_structures import (ElementClassifier, - Preprocessor, - Embedder, EmbeddingPipeline) from .utils import * \ No newline at end of file diff --git a/bluegraph/downstream/data_structures.py b/bluegraph/downstream/data_structures.py index d8c5856c..c7e61bd6 100644 --- a/bluegraph/downstream/data_structures.py +++ b/bluegraph/downstream/data_structures.py @@ -22,7 +22,6 @@ import re import pickle import shutil -import warnings from bluegraph.exceptions import BlueGraphException, BlueGraphWarning from .similarity import SimilarityProcessor @@ -93,44 +92,6 @@ def predict(self, pgframe, predict_elements=None): return self.model.predict(data) -class Preprocessor(ABC): - """Preprocessor inferface for EmbeddingPipeline.""" - - @abstractmethod - def info(self): - """Get dictionary with the info.""" - pass - - @abstractmethod - def fit(self, data, **kwargs): - """Fit the preprocessor.""" - pass - - @abstractmethod - def transform(self, data, **kwargs): - """Tranform the input data.""" - pass - - -class Embedder(ABC): - """Embedder inferface for EmbeddingPipeline.""" - - @abstractmethod - def info(self): - """Get dictionary with the info.""" - pass - - @abstractmethod - def fit_model(self, data, **kwargs): - """Train specified model on the provided data.""" - pass - - @abstractmethod - def predict_embeddings(self, data=None, **kwargs): - """Predict embeddings of out-sample elements.""" - pass - - class EmbeddingPipeline(object): """Data structure for stacking embedding pipelines. diff --git a/bluegraph/preprocess/encoders.py b/bluegraph/preprocess/encoders.py index 366f8d78..06b1b13b 100644 --- a/bluegraph/preprocess/encoders.py +++ b/bluegraph/preprocess/encoders.py @@ -30,12 +30,12 @@ Doc2VecEncoder, _get_encoder_type, _generate_type_repr) -from bluegraph.core.utils import normalize_to_set +from bluegraph.core.utils import normalize_to_set, Preprocessor from bluegraph.core.io import PandasPGFrame from bluegraph.exceptions import BlueGraphException -class SemanticPGEncoder(ABC): +class SemanticPGEncoder(Preprocessor): """Abstract class for semantic property graph encoder. The encoder provides a wrapper for multiple heterogeneous diff --git a/bluegraph/preprocess/utils.py b/bluegraph/preprocess/utils.py index 52db0a14..28a03081 100644 --- a/bluegraph/preprocess/utils.py +++ b/bluegraph/preprocess/utils.py @@ -26,7 +26,7 @@ from gensim.models.doc2vec import Doc2Vec, TaggedDocument from sklearn.feature_extraction.text import TfidfVectorizer -from bluegraph.downstream.data_structures import Preprocessor +from bluegraph.core.utils import Preprocessor def _get_encoder_type(pgframe, prop, is_edge=False): diff --git a/cord19kg/apps/visualization_app.py b/cord19kg/apps/visualization_app.py index 58b32bdf..167b85cc 100644 --- a/cord19kg/apps/visualization_app.py +++ b/cord19kg/apps/visualization_app.py @@ -36,7 +36,6 @@ from dash.exceptions import PreventUpdate from bluegraph.core.utils import top_n -from bluegraph.backends.networkx import pgframe_to_networkx import cord19kg from cord19kg.apps.resources import (CYTOSCAPE_STYLE_STYLESHEET, diff --git a/cord19kg/examples/notebooks/Glucose is a risk facor for COVID-19 (3000 papers).ipynb b/cord19kg/examples/notebooks/Glucose is a risk facor for COVID-19 (3000 papers).ipynb index f1c895ff..9d8a7d4b 100644 --- a/cord19kg/examples/notebooks/Glucose is a risk facor for COVID-19 (3000 papers).ipynb +++ b/cord19kg/examples/notebooks/Glucose is a risk facor for COVID-19 (3000 papers).ipynb @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -70,9 +70,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data path: '../data/'\n" + ] + } + ], "source": [ "try:\n", " print(f\"Data path: '{DATA_PATH}'\")\n", @@ -83,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -102,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -115,9 +123,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading the file to '../data/Glucose_risk_3000_papers.csv.zip'\n", + "Decompressing ...\n", + "Done.\n", + "CPU times: user 4.04 s, sys: 413 ms, total: 4.45 s\n", + "Wall time: 7.08 s\n" + ] + } + ], "source": [ "%%time\n", "download_from_nexus(\n", @@ -131,9 +151,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entityentity_typeoccurrence
876465allergicDISEASE13187:878:1157
2338378COVID-19DISEASE164361:6. Chronic Low-Grade Subclinical System...
1758577SARA ataxiaDISEASE21644:Risk Factors And Prognosis In Isolated I...
1656925CsADRUG21415:Abstracts:65
1692416fluoroquinoloneCHEMICAL21542:P835 Adequacy And Efficacy Of Initial Em...
\n", + "
" + ], + "text/plain": [ + " entity entity_type \\\n", + "876465 allergic DISEASE \n", + "2338378 COVID-19 DISEASE \n", + "1758577 SARA ataxia DISEASE \n", + "1656925 CsA DRUG \n", + "1692416 fluoroquinolone CHEMICAL \n", + "\n", + " occurrence \n", + "876465 13187:878:1157 \n", + "2338378 164361:6. Chronic Low-Grade Subclinical System... \n", + "1758577 21644:Risk Factors And Prognosis In Isolated I... \n", + "1656925 21415:Abstracts:65 \n", + "1692416 21542:P835 Adequacy And Efficacy Of Initial Em... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data.sample(5)" ] @@ -147,9 +249,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prepating curatation data...\n", + "Cleaning up the entities...\n", + "Aggregating occurrences of entities....\n", + "Done.\n", + "CPU times: user 27.8 s, sys: 978 ms, total: 28.8 s\n", + "Wall time: 29 s\n" + ] + } + ], "source": [ "%%time\n", "print(\"Prepating curatation data...\")\n", @@ -171,9 +286,142 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entityentity_typeparagraphpapersectionpaper_frequencyraw_entity_typesraw_frequency
67923pprsPROTEIN[21939:Evaluation Of Host Immune System Evasio...[186907, 206417, 21939][21939:Evaluation Of Host Immune System Evasio...3[PROTEIN, PROTEIN, PROTEIN]3
14943cd14 polymorphismsPROTEIN[14524:33:190, 16006:Myocardial Infarction :::...[14524, 16006][14524:33, 16006:Myocardial Infarction ]2[PROTEIN, PROTEIN]2
18261ciaDISEASE[6116:Center Of Biomedical Technology, Danube ...[6148, 14524, 21542, 6116, 6209, 16006, 13187,...[8327:Or-05. Synovial Intracellular Citrullina...12[DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, ...87
58959nonconvulsive status epilepticusDISEASE[7093:Introduction:1136, 13098:Results ::: Neu...[7093, 13098][13098:Results , 7093:Introduction]2[DISEASE, DISEASE]2
44483intestinal mucosaORGAN[13320:Caption:797, 13218:Effect:806, 6116:Edu...[16648, 21842, 7121, 17184, 21927, 225920, 218...[7121:Pathogenic Role Of Il-15 In Intestinal I...51[ORGAN, ORGAN, ORGAN, ORGAN, ORGAN, ORGAN, ORG...83
\n", + "
" + ], + "text/plain": [ + " entity entity_type \\\n", + "67923 pprs PROTEIN \n", + "14943 cd14 polymorphisms PROTEIN \n", + "18261 cia DISEASE \n", + "58959 nonconvulsive status epilepticus DISEASE \n", + "44483 intestinal mucosa ORGAN \n", + "\n", + " paragraph \\\n", + "67923 [21939:Evaluation Of Host Immune System Evasio... \n", + "14943 [14524:33:190, 16006:Myocardial Infarction :::... \n", + "18261 [6116:Center Of Biomedical Technology, Danube ... \n", + "58959 [7093:Introduction:1136, 13098:Results ::: Neu... \n", + "44483 [13320:Caption:797, 13218:Effect:806, 6116:Edu... \n", + "\n", + " paper \\\n", + "67923 [186907, 206417, 21939] \n", + "14943 [14524, 16006] \n", + "18261 [6148, 14524, 21542, 6116, 6209, 16006, 13187,... \n", + "58959 [7093, 13098] \n", + "44483 [16648, 21842, 7121, 17184, 21927, 225920, 218... \n", + "\n", + " section paper_frequency \\\n", + "67923 [21939:Evaluation Of Host Immune System Evasio... 3 \n", + "14943 [14524:33, 16006:Myocardial Infarction ] 2 \n", + "18261 [8327:Or-05. Synovial Intracellular Citrullina... 12 \n", + "58959 [13098:Results , 7093:Introduction] 2 \n", + "44483 [7121:Pathogenic Role Of Il-15 In Intestinal I... 51 \n", + "\n", + " raw_entity_types raw_frequency \n", + "67923 [PROTEIN, PROTEIN, PROTEIN] 3 \n", + "14943 [PROTEIN, PROTEIN] 2 \n", + "18261 [DISEASE, DISEASE, DISEASE, DISEASE, DISEASE, ... 87 \n", + "58959 [DISEASE, DISEASE] 2 \n", + "44483 [ORGAN, ORGAN, ORGAN, ORGAN, ORGAN, ORGAN, ORG... 83 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "curation_input_table.sample(5)" ] @@ -187,9 +435,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'paper': 3000, 'section': 53947, 'paragraph': 211380}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "factor_counts" ] @@ -219,9 +478,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading the ontology linking data...\n", + "Downloading the file to '../data/NCIT_ontology_linking_3000_papers.csv.zip'\n", + "Decompressing ...\n", + "\tLoading the linking dataframe in memory...\n", + "\tLoading ontology type mapping...\n", + "Downloading the file to '../data/NCIT_type_mapping.json'\n", + "Done.\n", + "CPU times: user 2.06 s, sys: 294 ms, total: 2.35 s\n", + "Wall time: 6.6 s\n" + ] + } + ], "source": [ "%%time\n", "print(\"Loading the ontology linking data...\")\n", @@ -258,11 +533,126 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mentionconceptuiddefinitiontaxonomy
47427bronchial epithelium metaplasiabronchial epitheliumhttp://purl.obolibrary.org/obo/NCIT_C32231The epithelium of the bronchus of the lung, wh...[('http://purl.obolibrary.org/obo/NCIT_C13181'...
92753matrix metalloproteinases (mmp) 1 and 3matrix metalloproteinasehttp://purl.obolibrary.org/obo/NCIT_C18164Matrix metalloproteinases are Zn(2+)-binding e...[('http://purl.obolibrary.org/obo/NCIT_C16843'...
124737supercomplex i+iiinadh dehydrogenase (ubiquinone)http://purl.obolibrary.org/obo/NCIT_C118184A complex of over 40 proteins found in the inn...[('http://purl.obolibrary.org/obo/NCIT_C21176'...
96732ox-rr-gemox regimenhttp://purl.obolibrary.org/obo/NCIT_C140731A regimen consisting of rituximab, gemcitabine...[('http://purl.obolibrary.org/obo/NCIT_C63442'...
44527geometric distortiongeometric distributionhttp://purl.obolibrary.org/obo/NCIT_C53211A discrete, statistical distribution which is ...[('http://purl.obolibrary.org/obo/NCIT_C53207'...
\n", + "
" + ], + "text/plain": [ + " mention \\\n", + "47427 bronchial epithelium metaplasia \n", + "92753 matrix metalloproteinases (mmp) 1 and 3 \n", + "124737 supercomplex i+iii \n", + "96732 ox-r \n", + "44527 geometric distortion \n", + "\n", + " concept \\\n", + "47427 bronchial epithelium \n", + "92753 matrix metalloproteinase \n", + "124737 nadh dehydrogenase (ubiquinone) \n", + "96732 r-gemox regimen \n", + "44527 geometric distribution \n", + "\n", + " uid \\\n", + "47427 http://purl.obolibrary.org/obo/NCIT_C32231 \n", + "92753 http://purl.obolibrary.org/obo/NCIT_C18164 \n", + "124737 http://purl.obolibrary.org/obo/NCIT_C118184 \n", + "96732 http://purl.obolibrary.org/obo/NCIT_C140731 \n", + "44527 http://purl.obolibrary.org/obo/NCIT_C53211 \n", + "\n", + " definition \\\n", + "47427 The epithelium of the bronchus of the lung, wh... \n", + "92753 Matrix metalloproteinases are Zn(2+)-binding e... \n", + "124737 A complex of over 40 proteins found in the inn... \n", + "96732 A regimen consisting of rituximab, gemcitabine... \n", + "44527 A discrete, statistical distribution which is ... \n", + "\n", + " taxonomy \n", + "47427 [('http://purl.obolibrary.org/obo/NCIT_C13181'... \n", + "92753 [('http://purl.obolibrary.org/obo/NCIT_C16843'... \n", + "124737 [('http://purl.obolibrary.org/obo/NCIT_C21176'... \n", + "96732 [('http://purl.obolibrary.org/obo/NCIT_C63442'... \n", + "44527 [('http://purl.obolibrary.org/obo/NCIT_C53207'... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "ontology_linking.sample(5)" ] @@ -289,7 +679,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -305,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -322,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -345,9 +735,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "curation_app.run(port=8074, mode=\"inline\")" ] @@ -361,9 +772,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merging the occurrence data with the ontology linking...\n" + ] + } + ], "source": [ "# curation_app.run(port=8070, mode=\"external\")" ] @@ -384,9 +803,138 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
papersectionparagraphaggregated_entitiesuiddefinitionpaper_frequencyentity_type
entity
silicon dioxide{21944, 185530, 191297, 21652, 21927, 207682, ...{7094:Caption, 7045:A-250 16, 21673:P26.36, 51...{5167:A. Jaworski And H. Brückner:12, 7094:Cap...[silica]http://purl.obolibrary.org/obo/NCIT_C29853A natural compound of silicon and oxygen found...30CHEMICAL
coronary artery{13852, 54259, 3071, 161523, 86688, 218781, 21...{194234:Caption, 1030:B-500 11, 56:Case Inform...{14045:0401:525, 193515::2, 28474:Conclusions:...[coronary arterial, coronary arteries, coronar...http://purl.obolibrary.org/obo/NCIT_C12843A principal artery that originates in the aort...380ORGAN
sxt{14321, 21573, 8225, 21542, 18225, 21939, 9769...{18225:P1068 Antimicrobial Susceptibility Patt...{18225:P1687 Current Profile Of Respiratory Tr...[sxt]NaNNaN9CHEMICAL
\n", + "
" + ], + "text/plain": [ + " paper \\\n", + "entity \n", + "silicon dioxide {21944, 185530, 191297, 21652, 21927, 207682, ... \n", + "coronary artery {13852, 54259, 3071, 161523, 86688, 218781, 21... \n", + "sxt {14321, 21573, 8225, 21542, 18225, 21939, 9769... \n", + "\n", + " section \\\n", + "entity \n", + "silicon dioxide {7094:Caption, 7045:A-250 16, 21673:P26.36, 51... \n", + "coronary artery {194234:Caption, 1030:B-500 11, 56:Case Inform... \n", + "sxt {18225:P1068 Antimicrobial Susceptibility Patt... \n", + "\n", + " paragraph \\\n", + "entity \n", + "silicon dioxide {5167:A. Jaworski And H. Brückner:12, 7094:Cap... \n", + "coronary artery {14045:0401:525, 193515::2, 28474:Conclusions:... \n", + "sxt {18225:P1687 Current Profile Of Respiratory Tr... \n", + "\n", + " aggregated_entities \\\n", + "entity \n", + "silicon dioxide [silica] \n", + "coronary artery [coronary arterial, coronary arteries, coronar... \n", + "sxt [sxt] \n", + "\n", + " uid \\\n", + "entity \n", + "silicon dioxide http://purl.obolibrary.org/obo/NCIT_C29853 \n", + "coronary artery http://purl.obolibrary.org/obo/NCIT_C12843 \n", + "sxt NaN \n", + "\n", + " definition \\\n", + "entity \n", + "silicon dioxide A natural compound of silicon and oxygen found... \n", + "coronary artery A principal artery that originates in the aort... \n", + "sxt NaN \n", + "\n", + " paper_frequency entity_type \n", + "entity \n", + "silicon dioxide 30 CHEMICAL \n", + "coronary artery 380 ORGAN \n", + "sxt 9 CHEMICAL " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "curated_occurrence_data = curation_app.get_curated_table()\n", "curated_occurrence_data.sample(3)" @@ -401,7 +949,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -419,9 +967,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['glucose', 'covid-19']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "curation_app.get_terms_to_include()" ] @@ -449,7 +1008,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -458,7 +1017,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -471,11 +1030,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------------\n", + "Factor: paragraph\n", + "-------------------------------\n", + "Done.\n", + "CPU times: user 2.22 s, sys: 566 ms, total: 2.78 s\n", + "Wall time: 8.46 s\n" + ] + } + ], "source": [ "%%time\n", "type_data = curated_occurrence_data[[\"entity_type\"]].rename(columns={\"entity_type\": \"type\"})\n", @@ -514,7 +1086,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -523,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -533,7 +1105,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -555,9 +1127,138 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading the file to '../data/Glucose_risk_3000_paper_meta_data.csv'\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleauthorsabstractdoiurljournalpmc_idpubmed_idpublish_time
id
3Surfactant protein-D and pulmonary host defenseCrouch, Erika CSurfactant protein-D (SP-D) participates in th...10.1186/rr19https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...Respir ResPMC5954911667972.02000-08-25
56CLINICAL VIGNETTESNaNNaN10.1046/j.1525-1497.18.s1.20.xhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...J Gen Intern MedPMC149498812753119.02003-04-01
58Clinical VignettesNaNNaN10.1046/j.1525-1497.2001.0160s1023.xhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...J Gen Intern MedPMC149531611357836.02001-04-01
\n", + "
" + ], + "text/plain": [ + " title authors \\\n", + "id \n", + "3 Surfactant protein-D and pulmonary host defense Crouch, Erika C \n", + "56 CLINICAL VIGNETTES NaN \n", + "58 Clinical Vignettes NaN \n", + "\n", + " abstract \\\n", + "id \n", + "3 Surfactant protein-D (SP-D) participates in th... \n", + "56 NaN \n", + "58 NaN \n", + "\n", + " doi \\\n", + "id \n", + "3 10.1186/rr19 \n", + "56 10.1046/j.1525-1497.18.s1.20.x \n", + "58 10.1046/j.1525-1497.2001.0160s1023.x \n", + "\n", + " url journal \\\n", + "id \n", + "3 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5... Respir Res \n", + "56 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... J Gen Intern Med \n", + "58 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1... J Gen Intern Med \n", + "\n", + " pmc_id pubmed_id publish_time \n", + "id \n", + "3 PMC59549 11667972.0 2000-08-25 \n", + "56 PMC1494988 12753119.0 2003-04-01 \n", + "58 PMC1495316 11357836.0 2001-04-01 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "paper_medata = download_from_nexus(uri=f\"{nexus_endpoint}/resources/{nexus_bucket}/_/8fc1e60c-1ebe-4173-82c0-9775a4917041\",\n", " output_path = DATA_PATH, config_file_path=nexus_config_file, nexus_endpoint=nexus_endpoint, nexus_bucket=nexus_bucket)\n", @@ -575,7 +1276,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -595,7 +1296,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -642,7 +1343,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -667,11 +1368,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "visualization_app.run(port=8081, mode=\"inline\")" ] @@ -685,11 +1407,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "# visualization_app.run(port=8081, mode=\"inline\")" + "# visualization_app.run(port=8081, mode=\"external\")" ] } ], diff --git a/cord19kg/utils.py b/cord19kg/utils.py index 609be540..fcc503f3 100644 --- a/cord19kg/utils.py +++ b/cord19kg/utils.py @@ -23,12 +23,10 @@ import zipfile import pandas as pd -import networkx as nx from collections import Counter from kgforge.core import KnowledgeGraphForge -from networkx.readwrite.json_graph.cytoscape import cytoscape_data from bluegraph.core.io import PandasPGFrame from bluegraph.preprocess import CooccurrenceGenerator @@ -238,7 +236,7 @@ def mentions_to_occurrence(raw_data, aggregation_function = set occurence_data = raw_data.groupby(term_column).aggregate( - lambda x: aggregation_function(x)) + aggregation_function) if dump_prefix is not None: print("Saving the occurrence data....") @@ -259,13 +257,14 @@ def aggregate_cord_entities(x, factors): The rest of the input occurrence factors are aggregated as sets (e.g. sets of unique papers/sections/paragraphs). """ - result = { - "entity_type": list(x.entity_type) - } - for f in factors: - result[f] = set(x[f]) + if x.name == "entity_type": + return x.tolist() + else: + return set(x.tolist()) + # for f in factors: + # result[f] = set(x[f]) - return result + # return result def prepare_occurrence_data(mentions_df=None, diff --git a/requirements.txt b/requirements.txt index bcb5ee27..9d9a2735 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ pyjwt==1.7.1 tensorflow stellargraph>=1.2.0 graph-tool==2.37 +dash==1.19.0 diff --git a/tests/downstream/test_link_prediction.py b/tests/downstream/test_link_prediction.py index 5dae63eb..60b4fc3c 100644 --- a/tests/downstream/test_link_prediction.py +++ b/tests/downstream/test_link_prediction.py @@ -1,11 +1,9 @@ from sklearn.svm import LinearSVC import numpy as np -import pandas as pd from sklearn import model_selection -from bluegraph.downstream import (EmbeddingPipeline, - get_classification_scores) +from bluegraph.downstream import get_classification_scores from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder from bluegraph.downstream.link_prediction import (generate_negative_edges, EdgePredictor)