Hotfix/missing dependency (colab fails) (#82)

* Resolved unnecessary dependency of preprocessing on downstream
* Added python_louvain fix
* Moved 'from_ontology' to the abstract PGFrame
* Bugfix in pandas aggregate (changed behaviour for 1.3.X)
BlueBrain · Jul 30, 2021 · 700fb64 · 700fb64
1 parent 3bb2e75
commit 700fb64
Show file tree

Hide file tree

Showing 13 changed files with 922 additions and 204 deletions.
diff --git a/bluegraph/backends/networkx/analyse/communities.py b/bluegraph/backends/networkx/analyse/communities.py
@@ -18,7 +18,7 @@
 from operator import itemgetter
 from functools import partial
 
-import community as community_louvain
+import community.community_louvain as community_louvain
 from networkx.algorithms.community.centrality import girvan_newman
 from networkx.algorithms.community.label_propagation import asyn_lpa_communities
 from networkx.algorithms.community.quality import (performance,

diff --git a/bluegraph/core/embed/embedders.py b/bluegraph/core/embed/embedders.py
@@ -25,11 +25,29 @@
 
 from bluegraph.exceptions import BlueGraphException, BlueGraphWarning
 
-
 DEFAULT_EMBEDDING_DIMENSION = 64
 
 
-class GraphElementEmbedder(ABC):
+class Embedder(ABC):
+    """Embedder interface for EmbeddingPipeline."""
+
+    @abstractmethod
+    def info(self):
+        """Get dictionary with the info."""
+        pass
+
+    @abstractmethod
+    def fit_model(self, data, **kwargs):
+        """Train specified model on the provided data."""
+        pass
+
+    @abstractmethod
+    def predict_embeddings(self, data=None, **kwargs):
+        """Predict embeddings of out-sample elements."""
+        pass
+
+
+class GraphElementEmbedder(Embedder):
     """Abstract class for a node/edge embedder."""
 
     @property
@@ -80,7 +98,7 @@ def __init__(self, model_name, directed=True, include_type=False,
         """Initialize StellarGraphEmbedder."""
         if model_name.lower() not in self._transductive_models and\
            model_name.lower() not in self._inductive_models:
-            raise ElementEmbedder.InvalidModelException(
+            raise GraphElementEmbedder.InvalidModelException(
                 f"Embedding model '{model_name.lower()}' is not implemented "
                 f"for {self.__class__.__name__}")
 
@@ -164,7 +182,7 @@ def predict_embeddings(self, pgframe, nodes=None):
         if nodes is None:
             nodes = pgframe.nodes()
         if self._embedding_model is None:
-            raise ElementEmbedder.PredictionException(
+            raise GraphElementEmbedder.PredictionException(
                 "Embedder does not have a predictive model")
 
         input_graph = self._generate_graph(pgframe, self.graph_configs)
@@ -241,6 +259,6 @@ class PredictionException(BlueGraphException):
         pass
 
 
-class GraphEmbedder(ABC):
+class GraphEmbedder(Embedder):
     """Abstract class for a graph embedder."""
     pass
diff --git a/bluegraph/core/io.py b/bluegraph/core/io.py
@@ -463,12 +463,13 @@ def from_jsonld(self, resources, include_context=True, type_handler=None,
         resources : iterable of dict
             Collection of input resources in JSON-LD format
         include_context : bool, optional
-            Flag indicating if the context should be included as a property. Default is True.
+            Flag indicating if the context should be included as a property.
+            Default is True.
         type_handler : func, optional
             Function to apply to the value of type (e.g. '@type')
         types_from_relations : bool, optional
-            Flag indicating if resources with unkown types should be assigned with types
-            from the incoming relations. Default is True
+            Flag indicating if resources with unknown types should be assigned
+            with types from the incoming relations. Default is True
         exclude : list of str, optional
             Collection of property names to exclude. Default is empty.
         only_props : bool, optional
@@ -613,6 +614,97 @@ def density(self, directed=True):
 
         return self.number_of_edges() / total_edges
 
+    @classmethod
+    def from_ontology(cls, filepath=None, rdf_graph=None, format="turtle",
+                      remove_prop_uris=False):
+        """Create a PGFrame (an instance of `cls`) from an ontology."""
+        if filepath is None and rdf_graph is None:
+            raise ValueError(
+                "Ontology source must be specified: both "
+                "'filepath' or 'rdf_graph' are None")
+
+        if rdf_graph is None:
+            g = rdflib.Graph()
+            g.parse(filepath, format=format)
+        else:
+            g = rdf_graph
+
+        classes = set()
+        individuals = set()
+
+        # Extract classes and individuals as nodes
+        for s in g.subjects(RDF.type, OWL.Class):
+            if g.label(s):
+                classes.add(s)
+        for s in g.subjects(RDF.type, OWL.NamedIndividual):
+            if g.label(s):
+                individuals.add(s)
+
+        edges = defaultdict(set)
+        props = defaultdict(dict)
+        for c in classes:
+            node_id = g.label(c).value
+            for p, o in g.predicate_objects(c):
+                if isinstance(o, rdflib.Literal):
+                    prop_name = None
+                    if g.label(p):
+                        prop_name = g.label(p).value
+                    else:
+                        prop_name = str(p)
+                    if node_id in props[prop_name]:
+                        if isinstance(props[prop_name][node_id], list):
+                            props[prop_name][node_id].append(o.value)
+                        else:
+                            props[prop_name][node_id] = [
+                                props[prop_name][node_id], o.value
+                            ]
+                    else:
+                        props[prop_name][node_id] = o.value
+                else:
+                    source_node = g.label(c).value
+                    if p.eq(RDFS.subClassOf):
+                        if isinstance(o, rdflib.BNode):
+                            target_node = None
+                            for oo in g.objects(o, OWL.someValuesFrom):
+                                if g.label(oo):
+                                    target_node = g.label(oo).value
+                            for oo in g.objects(o, OWL.onProperty):
+                                if g.label(oo):
+                                    edge_label = g.label(oo).value
+                            if target_node:
+                                edges[(source_node, target_node)].add(edge_label)
+                        else:
+                            if g.label(o):
+                                edges[(source_node, g.label(o).value)].add(
+                                    "IS_SUBCLASS_OF")
+                    elif not p.eq(RDF.type):
+                        target_node = g.label(o)
+                        if target_node:
+                            edges[(source_node, target_node.value)].add(str(p))
+
+        # Create a PGFrame
+        graph = cls(
+            nodes=[g.label(el).value for el in classes.union(individuals)],
+            edges=list(edges.keys()))
+        graph.add_edge_types(edges)
+
+        # Add node props
+        for k, v in props.items():
+            graph.add_node_properties(
+                pd.DataFrame(v.items(), columns=["@id", k])
+            )
+
+        # Remove uri's from property names: 'http://...#prop' becomes 'prop'
+        if remove_prop_uris:
+            mapping = {}
+            for p in graph.node_properties():
+                match = re.match(r"(http:\/\/.*)#(.*)", p)
+                if match:
+                    mapping[p] = match.groups()[1]
+            graph.rename_node_properties(mapping)
+
+        return graph
+
     class PGFrameException(BlueGraphException):
         pass
 
@@ -1239,97 +1331,6 @@ def rename_edge_properties(self, mapping):
                 del self._edge_prop_types[k]
         self._edge_prop_types.update(new_typing)
 
-    @classmethod
-    def from_ontology(cls, filepath=None, rdf_graph=None, format="turtle",
-                      remove_prop_uris=False):
-        """Create a PandasPGFrame from ontology."""
-        if filepath is None and rdf_graph is None:
-            raise ValueError(
-                "Ontology source must be specified: both "
-                "'filepath' or 'rdf_graph' are None")
-
-        if rdf_graph is None:
-            g = rdflib.Graph()
-            g.parse(filepath, format=format)
-        else:
-            g = rdf_graph
-
-        classes = set()
-        individuals = set()
-
-        # Extract classes and individuals as nodes
-        for s in g.subjects(RDF.type, OWL.Class):
-            if g.label(s):
-                classes.add(s)
-        for s in g.subjects(RDF.type, OWL.NamedIndividual):
-            if g.label(s):
-                individuals.add(s)
-
-        edges = defaultdict(set)
-        props = defaultdict(dict)
-        for c in classes:
-            node_id = g.label(c).value
-            for p, o in g.predicate_objects(c):
-                if isinstance(o, rdflib.Literal):
-                    prop_name = None
-                    if g.label(p):
-                        prop_name = g.label(p).value
-                    else:
-                        prop_name = str(p)
-                    if node_id in props[prop_name]:
-                        if isinstance(props[prop_name][node_id], list):
-                            props[prop_name][node_id].append(o.value)
-                        else:
-                            props[prop_name][node_id] = [
-                                props[prop_name][node_id], o.value
-                            ]
-                    else:
-                        props[prop_name][node_id] = o.value
-                else:
-                    source_node = g.label(c).value
-                    if p.eq(RDFS.subClassOf):
-                        if isinstance(o, rdflib.BNode):
-                            target_node = None
-                            for oo in g.objects(o, OWL.someValuesFrom):
-                                if g.label(oo):
-                                    target_node = g.label(oo).value
-                            for oo in g.objects(o, OWL.onProperty):
-                                if g.label(oo):
-                                    edge_label = g.label(oo).value
-                            if target_node:
-                                edges[(source_node, target_node)].add(edge_label)
-                        else:
-                            if g.label(o):
-                                edges[(source_node, g.label(o).value)].add(
-                                    "IS_SUBCLASS_OF")
-                    elif not p.eq(RDF.type):
-                        target_node = g.label(o)
-                        if target_node:
-                            edges[(source_node, target_node.value)].add(str(p))
-
-        # Create a PGFrame
-        graph = cls(
-            nodes=[g.label(el).value for el in classes.union(individuals)],
-            edges=list(edges.keys()))
-        graph.add_edge_types(edges)
-
-        # Add node props
-        for k, v in props.items():
-            graph.add_node_properties(
-                pd.DataFrame(v.items(), columns=["@id", k])
-            )
-
-        # Remove uri's from property names: 'http://...#prop' becomes 'prop'
-        if remove_prop_uris:
-            mapping = {}
-            for p in graph.node_properties():
-                match = re.match(r"(http:\/\/.*)#(.*)", p)
-                if match:
-                    mapping[p] = match.groups()[1]
-            graph.rename_node_properties(mapping)
-
-        return graph
-
 
 class SparkPGFrame(PGFrame):
     """Class for storing typed PGs as a collection of Spark DataFrames."""

diff --git a/bluegraph/core/utils.py b/bluegraph/core/utils.py
@@ -13,6 +13,8 @@
 #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #    See the License for the specific language governing permissions and
 #    limitations under the License.
+from abc import ABC, abstractmethod
+
 import math
 import pandas as pd
 
@@ -74,3 +76,22 @@ def top_n(data_dict, n, smallest=False):
     else:
         df = df.nlargest(n, columns=["value"])
     return(list(df["id"]))
+
+
+class Preprocessor(ABC):
+    """Preprocessor interface for EmbeddingPipeline."""
+
+    @abstractmethod
+    def info(self):
+        """Get dictionary with the info."""
+        pass
+
+    @abstractmethod
+    def fit(self, data, **kwargs):
+        """Fit the preprocessor."""
+        pass
+
+    @abstractmethod
+    def transform(self, data, **kwargs):
+        """Transform the input data."""
+        pass
diff --git a/bluegraph/downstream/__init__.py b/bluegraph/downstream/__init__.py
@@ -14,7 +14,5 @@
 #    See the License for the specific language governing permissions and
 #    limitations under the License.
 from .data_structures import (ElementClassifier,
-                              Preprocessor,
-                              Embedder,
                               EmbeddingPipeline)
 from .utils import *
diff --git a/bluegraph/downstream/data_structures.py b/bluegraph/downstream/data_structures.py
@@ -22,7 +22,6 @@
 import re
 import pickle
 import shutil
-import warnings
 
 from bluegraph.exceptions import BlueGraphException, BlueGraphWarning
 from .similarity import SimilarityProcessor
@@ -93,44 +92,6 @@ def predict(self, pgframe, predict_elements=None):
         return self.model.predict(data)
 
 
-class Preprocessor(ABC):
-    """Preprocessor inferface for EmbeddingPipeline."""
-
-    @abstractmethod
-    def info(self):
-        """Get dictionary with the info."""
-        pass
-
-    @abstractmethod
-    def fit(self, data, **kwargs):
-        """Fit the preprocessor."""
-        pass
-
-    @abstractmethod
-    def transform(self, data, **kwargs):
-        """Tranform the input data."""
-        pass
-
-
-class Embedder(ABC):
-    """Embedder inferface for EmbeddingPipeline."""
-
-    @abstractmethod
-    def info(self):
-        """Get dictionary with the info."""
-        pass
-
-    @abstractmethod
-    def fit_model(self, data, **kwargs):
-        """Train specified model on the provided data."""
-        pass
-
-    @abstractmethod
-    def predict_embeddings(self, data=None, **kwargs):
-        """Predict embeddings of out-sample elements."""
-        pass
-
-
 class EmbeddingPipeline(object):
     """Data structure for stacking embedding pipelines.
 

diff --git a/bluegraph/preprocess/encoders.py b/bluegraph/preprocess/encoders.py
@@ -30,12 +30,12 @@
                     Doc2VecEncoder,
                     _get_encoder_type,
                     _generate_type_repr)
-from bluegraph.core.utils import normalize_to_set
+from bluegraph.core.utils import normalize_to_set, Preprocessor
 from bluegraph.core.io import PandasPGFrame
 from bluegraph.exceptions import BlueGraphException
 
 
-class SemanticPGEncoder(ABC):
+class SemanticPGEncoder(Preprocessor):
     """Abstract class for semantic property graph encoder.
 
     The encoder provides a wrapper for multiple heterogeneous

diff --git a/bluegraph/preprocess/utils.py b/bluegraph/preprocess/utils.py
@@ -26,7 +26,7 @@
 from gensim.models.doc2vec import Doc2Vec, TaggedDocument
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from bluegraph.downstream.data_structures import Preprocessor
+from bluegraph.core.utils import Preprocessor
 
 
 def _get_encoder_type(pgframe, prop, is_edge=False):