Skip to content

Commit

Permalink
Hotfix/missing dependency (colab fails) (#82)
Browse files Browse the repository at this point in the history
* Resolved unnecessary dependency of preprocessing on downstream

* Added python_louvain fix

* Moved 'from_ontology' to the abstract PGFrame

* Bugfix in pandas aggregate (changed behaviour for 1.3.X)
  • Loading branch information
eugeniashurko authored Jul 30, 2021
1 parent 3bb2e75 commit 700fb64
Show file tree
Hide file tree
Showing 13 changed files with 922 additions and 204 deletions.
2 changes: 1 addition & 1 deletion bluegraph/backends/networkx/analyse/communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from operator import itemgetter
from functools import partial

import community as community_louvain
import community.community_louvain as community_louvain
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community.label_propagation import asyn_lpa_communities
from networkx.algorithms.community.quality import (performance,
Expand Down
28 changes: 23 additions & 5 deletions bluegraph/core/embed/embedders.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,29 @@

from bluegraph.exceptions import BlueGraphException, BlueGraphWarning


DEFAULT_EMBEDDING_DIMENSION = 64


class GraphElementEmbedder(ABC):
class Embedder(ABC):
    """Embedder interface for EmbeddingPipeline.

    Subclasses must override every method below; the class itself
    cannot be instantiated while any of them is still abstract.
    """

    @abstractmethod
    def info(self):
        """Get dictionary with the info."""

    @abstractmethod
    def fit_model(self, data, **kwargs):
        """Train specified model on the provided data."""

    @abstractmethod
    def predict_embeddings(self, data=None, **kwargs):
        """Predict embeddings of out-of-sample elements."""


class GraphElementEmbedder(Embedder):
"""Abstract class for a node/edge embedder."""

@property
Expand Down Expand Up @@ -80,7 +98,7 @@ def __init__(self, model_name, directed=True, include_type=False,
"""Initialize StellarGraphEmbedder."""
if model_name.lower() not in self._transductive_models and\
model_name.lower() not in self._inductive_models:
raise ElementEmbedder.InvalidModelException(
raise GraphElementEmbedder.InvalidModelException(
f"Embedding model '{model_name.lower()}' is not implemented "
f"for {self.__class__.__name__}")

Expand Down Expand Up @@ -164,7 +182,7 @@ def predict_embeddings(self, pgframe, nodes=None):
if nodes is None:
nodes = pgframe.nodes()
if self._embedding_model is None:
raise ElementEmbedder.PredictionException(
raise GraphElementEmbedder.PredictionException(
"Embedder does not have a predictive model")

input_graph = self._generate_graph(pgframe, self.graph_configs)
Expand Down Expand Up @@ -241,6 +259,6 @@ class PredictionException(BlueGraphException):
pass


class GraphEmbedder(ABC):
class GraphEmbedder(Embedder):
    """Abstract class for a graph embedder.

    Declares no members of its own beyond what ``Embedder`` provides.
    """
189 changes: 95 additions & 94 deletions bluegraph/core/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,12 +463,13 @@ def from_jsonld(self, resources, include_context=True, type_handler=None,
resources : iterable of dict
Collection of input resources in JSON-LD format
include_context : bool, optional
Flag indicating if the context should be included as a property. Default is True.
Flag indicating if the context should be included as a property.
Default is True.
type_handler : func, optional
Function to apply to the value of type (e.g. '@type')
types_from_relations : bool, optional
Flag indicating if resources with unknown types should be assigned with types
from the incoming relations. Default is True.
Flag indicating if resources with unknown types should be assigned
with types from the incoming relations. Default is True.
exclude : list of str, optional
Collection of property names to exclude. Default is empty.
only_props : bool, optional
Expand Down Expand Up @@ -613,6 +614,97 @@ def density(self, directed=True):

return self.number_of_edges() / total_edges

@classmethod
def from_ontology(cls, filepath=None, rdf_graph=None, format="turtle",
                  remove_prop_uris=False):
    """Create a property graph frame of type ``cls`` from an ontology.

    Labelled OWL classes and named individuals become nodes identified
    by their label. Only classes are scanned for outgoing statements:
    literal-valued predicates become node properties, all other
    predicates (except ``rdf:type``) become typed edges. Individuals
    are added as nodes only.

    Parameters
    ----------
    filepath : str, optional
        Path to the ontology file; parsed with rdflib when `rdf_graph`
        is not given.
    rdf_graph : rdflib.Graph, optional
        Already loaded RDF graph. Takes precedence over `filepath`.
    format : str, optional
        Serialization format passed to the rdflib parser.
        Default is "turtle".
    remove_prop_uris : bool, optional
        Flag indicating if URI prefixes should be removed from node
        property names ('http://...#prop' becomes 'prop').
        Default is False.

    Returns
    -------
    graph : instance of `cls`
        Frame holding the extracted nodes, typed edges and node
        properties.

    Raises
    ------
    ValueError
        If neither `filepath` nor `rdf_graph` is provided.
    """
    if filepath is None and rdf_graph is None:
        raise ValueError(
            "Ontology source must be specified: both "
            "'filepath' or 'rdf_graph' are None")

    if rdf_graph is None:
        g = rdflib.Graph()
        g.parse(filepath, format=format)
    else:
        g = rdf_graph

    # NOTE(review): `Graph.label` appears to be deprecated in newer
    # rdflib releases -- confirm the pinned version still provides it.

    # Extract classes and individuals as nodes (unlabelled ones are
    # skipped because the label is used as the node id)
    classes = {
        s for s in g.subjects(RDF.type, OWL.Class) if g.label(s)}
    individuals = {
        s for s in g.subjects(RDF.type, OWL.NamedIndividual) if g.label(s)}

    edges = defaultdict(set)
    props = defaultdict(dict)
    for c in classes:
        node_id = g.label(c).value
        for p, o in g.predicate_objects(c):
            if isinstance(o, rdflib.Literal):
                # Prefer the predicate's label, fall back to its URI
                p_label = g.label(p)
                prop_name = p_label.value if p_label else str(p)
                values = props[prop_name]
                if node_id not in values:
                    values[node_id] = o.value
                elif isinstance(values[node_id], list):
                    values[node_id].append(o.value)
                else:
                    # Second value for the same property: switch to a list
                    values[node_id] = [values[node_id], o.value]
            elif p.eq(RDFS.subClassOf):
                if isinstance(o, rdflib.BNode):
                    # Anonymous superclass: the edge target comes from
                    # `owl:someValuesFrom`, its type from `owl:onProperty`.
                    # Both are reset here so that a value found for a
                    # previous blank node can never leak into this one.
                    target_node = None
                    edge_label = None
                    for oo in g.objects(o, OWL.someValuesFrom):
                        if g.label(oo):
                            target_node = g.label(oo).value
                    for oo in g.objects(o, OWL.onProperty):
                        if g.label(oo):
                            edge_label = g.label(oo).value
                    # Without a labelled property there is no edge type
                    # to record, so the edge is skipped.
                    if target_node and edge_label is not None:
                        edges[(node_id, target_node)].add(edge_label)
                else:
                    parent_label = g.label(o)
                    if parent_label:
                        edges[(node_id, parent_label.value)].add(
                            "IS_SUBCLASS_OF")
            elif not p.eq(RDF.type):
                target_label = g.label(o)
                if target_label:
                    edges[(node_id, target_label.value)].add(str(p))

    # Create a PGFrame
    graph = cls(
        nodes=[g.label(el).value for el in classes.union(individuals)],
        edges=list(edges.keys()))
    graph.add_edge_types(edges)

    # Add node props
    for k, v in props.items():
        graph.add_node_properties(
            pd.DataFrame(v.items(), columns=["@id", k])
        )

    # Remove uri's from property names: 'http://...#prop' becomes 'prop'
    if remove_prop_uris:
        mapping = {}
        for prop in graph.node_properties():
            match = re.match(r"(http:\/\/.*)#(.*)", prop)
            if match:
                mapping[prop] = match.groups()[1]
        graph.rename_node_properties(mapping)

    return graph

class PGFrameException(BlueGraphException):
    """Exception type derived from ``BlueGraphException``."""

Expand Down Expand Up @@ -1239,97 +1331,6 @@ def rename_edge_properties(self, mapping):
del self._edge_prop_types[k]
self._edge_prop_types.update(new_typing)

@classmethod
def from_ontology(cls, filepath=None, rdf_graph=None, format="turtle",
                  remove_prop_uris=False):
    """Create a property graph frame of type ``cls`` from an ontology.

    Labelled OWL classes and named individuals become nodes identified
    by their label. Only classes are scanned for outgoing statements:
    literal-valued predicates become node properties, all other
    predicates (except ``rdf:type``) become typed edges. Individuals
    are added as nodes only.

    Parameters
    ----------
    filepath : str, optional
        Path to the ontology file; parsed with rdflib when `rdf_graph`
        is not given.
    rdf_graph : rdflib.Graph, optional
        Already loaded RDF graph. Takes precedence over `filepath`.
    format : str, optional
        Serialization format passed to the rdflib parser.
        Default is "turtle".
    remove_prop_uris : bool, optional
        Flag indicating if URI prefixes should be removed from node
        property names ('http://...#prop' becomes 'prop').
        Default is False.

    Returns
    -------
    graph : instance of `cls`
        Frame holding the extracted nodes, typed edges and node
        properties.

    Raises
    ------
    ValueError
        If neither `filepath` nor `rdf_graph` is provided.
    """
    if filepath is None and rdf_graph is None:
        raise ValueError(
            "Ontology source must be specified: both "
            "'filepath' or 'rdf_graph' are None")

    if rdf_graph is None:
        g = rdflib.Graph()
        g.parse(filepath, format=format)
    else:
        g = rdf_graph

    # NOTE(review): `Graph.label` appears to be deprecated in newer
    # rdflib releases -- confirm the pinned version still provides it.

    # Extract classes and individuals as nodes (unlabelled ones are
    # skipped because the label is used as the node id)
    classes = {
        s for s in g.subjects(RDF.type, OWL.Class) if g.label(s)}
    individuals = {
        s for s in g.subjects(RDF.type, OWL.NamedIndividual) if g.label(s)}

    edges = defaultdict(set)
    props = defaultdict(dict)
    for c in classes:
        node_id = g.label(c).value
        for p, o in g.predicate_objects(c):
            if isinstance(o, rdflib.Literal):
                # Prefer the predicate's label, fall back to its URI
                p_label = g.label(p)
                prop_name = p_label.value if p_label else str(p)
                values = props[prop_name]
                if node_id not in values:
                    values[node_id] = o.value
                elif isinstance(values[node_id], list):
                    values[node_id].append(o.value)
                else:
                    # Second value for the same property: switch to a list
                    values[node_id] = [values[node_id], o.value]
            elif p.eq(RDFS.subClassOf):
                if isinstance(o, rdflib.BNode):
                    # Anonymous superclass: the edge target comes from
                    # `owl:someValuesFrom`, its type from `owl:onProperty`.
                    # Both are reset here so that a value found for a
                    # previous blank node can never leak into this one.
                    target_node = None
                    edge_label = None
                    for oo in g.objects(o, OWL.someValuesFrom):
                        if g.label(oo):
                            target_node = g.label(oo).value
                    for oo in g.objects(o, OWL.onProperty):
                        if g.label(oo):
                            edge_label = g.label(oo).value
                    # Without a labelled property there is no edge type
                    # to record, so the edge is skipped.
                    if target_node and edge_label is not None:
                        edges[(node_id, target_node)].add(edge_label)
                else:
                    parent_label = g.label(o)
                    if parent_label:
                        edges[(node_id, parent_label.value)].add(
                            "IS_SUBCLASS_OF")
            elif not p.eq(RDF.type):
                target_label = g.label(o)
                if target_label:
                    edges[(node_id, target_label.value)].add(str(p))

    # Create a PGFrame
    graph = cls(
        nodes=[g.label(el).value for el in classes.union(individuals)],
        edges=list(edges.keys()))
    graph.add_edge_types(edges)

    # Add node props
    for k, v in props.items():
        graph.add_node_properties(
            pd.DataFrame(v.items(), columns=["@id", k])
        )

    # Remove uri's from property names: 'http://...#prop' becomes 'prop'
    if remove_prop_uris:
        mapping = {}
        for prop in graph.node_properties():
            match = re.match(r"(http:\/\/.*)#(.*)", prop)
            if match:
                mapping[prop] = match.groups()[1]
        graph.rename_node_properties(mapping)

    return graph


class SparkPGFrame(PGFrame):
"""Class for storing typed PGs as a collection of Spark DataFrames."""
Expand Down
21 changes: 21 additions & 0 deletions bluegraph/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod

import math
import pandas as pd

Expand Down Expand Up @@ -74,3 +76,22 @@ def top_n(data_dict, n, smallest=False):
else:
df = df.nlargest(n, columns=["value"])
return(list(df["id"]))


class Preprocessor(ABC):
    """Preprocessor interface for EmbeddingPipeline.

    Subclasses must override every method below; the class itself
    cannot be instantiated while any of them is still abstract.
    """

    @abstractmethod
    def info(self):
        """Get dictionary with the info."""

    @abstractmethod
    def fit(self, data, **kwargs):
        """Fit the preprocessor."""

    @abstractmethod
    def transform(self, data, **kwargs):
        """Transform the input data."""
2 changes: 0 additions & 2 deletions bluegraph/downstream/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .data_structures import (ElementClassifier,
Preprocessor,
Embedder,
EmbeddingPipeline)
from .utils import *
39 changes: 0 additions & 39 deletions bluegraph/downstream/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import re
import pickle
import shutil
import warnings

from bluegraph.exceptions import BlueGraphException, BlueGraphWarning
from .similarity import SimilarityProcessor
Expand Down Expand Up @@ -93,44 +92,6 @@ def predict(self, pgframe, predict_elements=None):
return self.model.predict(data)


class Preprocessor(ABC):
    """Preprocessor interface for EmbeddingPipeline.

    Subclasses must override every method below; the class itself
    cannot be instantiated while any of them is still abstract.
    """

    @abstractmethod
    def info(self):
        """Get dictionary with the info."""

    @abstractmethod
    def fit(self, data, **kwargs):
        """Fit the preprocessor."""

    @abstractmethod
    def transform(self, data, **kwargs):
        """Transform the input data."""


class Embedder(ABC):
    """Embedder interface for EmbeddingPipeline.

    Subclasses must override every method below; the class itself
    cannot be instantiated while any of them is still abstract.
    """

    @abstractmethod
    def info(self):
        """Get dictionary with the info."""

    @abstractmethod
    def fit_model(self, data, **kwargs):
        """Train specified model on the provided data."""

    @abstractmethod
    def predict_embeddings(self, data=None, **kwargs):
        """Predict embeddings of out-of-sample elements."""


class EmbeddingPipeline(object):
"""Data structure for stacking embedding pipelines.
Expand Down
4 changes: 2 additions & 2 deletions bluegraph/preprocess/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@
Doc2VecEncoder,
_get_encoder_type,
_generate_type_repr)
from bluegraph.core.utils import normalize_to_set
from bluegraph.core.utils import normalize_to_set, Preprocessor
from bluegraph.core.io import PandasPGFrame
from bluegraph.exceptions import BlueGraphException


class SemanticPGEncoder(ABC):
class SemanticPGEncoder(Preprocessor):
"""Abstract class for semantic property graph encoder.
The encoder provides a wrapper for multiple heterogeneous
Expand Down
2 changes: 1 addition & 1 deletion bluegraph/preprocess/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer

from bluegraph.downstream.data_structures import Preprocessor
from bluegraph.core.utils import Preprocessor


def _get_encoder_type(pgframe, prop, is_edge=False):
Expand Down
Loading

0 comments on commit 700fb64

Please sign in to comment.