Skip to content

Commit

Permalink
Feature/dim reduction (#77)
Browse files Browse the repository at this point in the history
* Added dim reduction
* Test upd

Co-authored-by: Mohameth François SY <[email protected]>
  • Loading branch information
eugeniashurko and MFSY authored Jul 28, 2021
1 parent 838e9e0 commit 3bb2e75
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 19 deletions.
14 changes: 1 addition & 13 deletions bluegraph/downstream/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ def predict(self, pgframe, predict_elements=None):
return self.model.predict(data)



class Preprocessor(ABC):
"""Preprocessor interface for EmbeddingPipeline."""

Expand Down Expand Up @@ -245,21 +244,10 @@ def get_index(self):
"""Get index of existing points."""
return self.similarity_processor.index

def generate_embedding_table(self):
"""Generate embedding table from similarity index."""
index = self.similarity_processor.index
pairs = [
(ind, self.similarity_processor._model.reconstruct(i))
for i, ind in enumerate(index)
]
return pd.DataFrame(
pairs, columns=["@id", "embedding"]).set_index("@id")


def retrieve_embeddings(self, indices):
"""Get embedding vectors for the input indices."""
if self.similarity_processor is None:
raise EmbeddingPipelineException(
raise EmbeddingPipeline.EmbeddingPipelineException(
"Similarity processor object is None, cannot "
"retrieve embedding vectors")
else:
Expand Down
92 changes: 88 additions & 4 deletions bluegraph/preprocess/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@
import math
import numpy as np
import pandas as pd
from scipy.sparse import issparse

from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA

from .utils import (TfIdfEncoder,
Doc2VecEncoder,
Expand Down Expand Up @@ -95,7 +96,11 @@ def __init__(self, node_properties=None, edge_properties=None,
text_encoding_max_dimension=128,
missing_numeric="drop",
imputation_strategy="mean",
standardize_numeric=True):
standardize_numeric=True,
reduce_node_dims=False,
reduce_edge_dims=False,
n_node_components=None,
n_edge_components=None):
"""Initialize an encoder.
Parameters
Expand All @@ -120,6 +125,18 @@ def __init__(self, node_properties=None, edge_properties=None,
imputation_strategy : str, optional
standardize_numeric : bool, optional
Flag indicating if numerical values should be standardized
reduce_node_dims : bool, optional
Flag indicating if dimensionality reduction should be performed
on the node features. By default, False
reduce_edge_dims : bool, optional
Flag indicating if dimensionality reduction should be performed
on the edge features. By default, False
n_node_components : int, optional
Number of principal components to include for dimensionality
reduction of the node features.
n_edge_components : int, optional
Number of principal components to include for dimensionality
reduction of the edge features.
"""
self.heterogeneous = heterogeneous
self.drop_types = drop_types
Expand All @@ -131,6 +148,10 @@ def __init__(self, node_properties=None, edge_properties=None,
self.missing_numeric = missing_numeric
self.imputation_strategy = imputation_strategy
self.standardize_numeric = standardize_numeric
self.reduce_node_dims = reduce_node_dims
self.reduce_edge_dims = reduce_edge_dims
self.n_node_components = n_node_components
self.n_edge_components = n_edge_components

self._node_encoders = {}
if node_properties is not None:
Expand Down Expand Up @@ -163,6 +184,26 @@ def __init__(self, node_properties=None, edge_properties=None,
for p in edge_properties:
self._edge_encoders[p] = None

self.node_reducer = None
if self.reduce_node_dims is True:
if self.n_node_components is None:
raise SemanticPGEncoder.EncodingException(
"Dimensionality reduction of node features is enabled, "
"number of dimensions ('n_node_components' must be "
"specified)")
else:
self.node_reducer = PCA(n_components=self.n_node_components)

self.edge_reducer = None
if self.reduce_edge_dims is True:
if self.n_edge_components is None:
raise SemanticPGEncoder.EncodingException(
"Dimensionality reduction of edge features is enabled, "
"number of dimensions ('n_edge_components' must be "
"specified)")
else:
self.edge_reducer = PCA(n_components=self.n_edge_components)

def info(self):
if self.heterogeneous:
node_properties = {}
Expand Down Expand Up @@ -191,7 +232,11 @@ def info(self):
"imputation_strategy": self.imputation_strategy,
"standardize_numeric": self.standardize_numeric,
"node_properties": node_properties,
"edge_properties": edge_properties
"edge_properties": edge_properties,
"reduce_node_dims": self.reduce_node_dims,
"reduce_edge_dims": self.reduce_edge_dims,
"n_node_components": self.n_node_components,
"n_edge_components": self.n_edge_components,
}
return info

Expand Down Expand Up @@ -231,7 +276,23 @@ def fit(self, pgframe):
pgframe.edges(raw_frame=True), prop,
_get_encoder_type(pgframe, prop, is_edge=True))

def transform(self, pgframe):
transformed_frame = None
if self.node_reducer is not None:
transformed_frame = self.transform(pgframe, skip_reduction=True)
X = np.array(
transformed_frame.get_node_property_values("features").apply(
lambda x: x.tolist() if not isinstance(x, list) else x).tolist())
self.node_reducer.fit(X)

if self.edge_reducer is not None and self.edge_features:
if transformed_frame is None:
transformed_frame = self.transform(pgframe, skip_reduction=True)
X = np.array(transformed_frame.get_edge_property_values(
"features").apply(
lambda x: x.tolist() if not isinstance(x, list) else x).tolist())
self.edge_reducer.fit(X)

def transform(self, pgframe, skip_reduction=False):
"""Transform the input PGFrame."""
transformed_pgframe = self._create_pgframe(
nodes=pgframe.nodes(), edges=pgframe.edges())
Expand Down Expand Up @@ -304,6 +365,27 @@ def _aggregate_encoding_of(frame, encoder, aggregation_handler,
transformed_pgframe.aggregate_edge_properties(
self._concatenate_features, into="features")

if not skip_reduction:
if self.node_reducer is not None:
X = np.array(transformed_pgframe.get_node_property_values(
"features").apply(
lambda x: x.tolist() if not isinstance(x, list) else x).tolist())
reduced_X = self.node_reducer.transform(X)
df = pd.DataFrame(
columns=["features"],
index=transformed_pgframe.nodes())
df["features"] = reduced_X.tolist()
transformed_pgframe.add_node_properties(df)

if self.edge_reducer is not None and self.edge_features:
X = np.array(transformed_pgframe.get_edge_property_values(
"features").apply(
lambda x: x.tolist() if not isinstance(x, list) else x).tolist())
reduced_X = self.edge_reducer.transform(X)
# Here, we need to make it generic and not Pandas-dependent
transformed_pgframe._edges["features"] =\
reduced_X.tolist()

return transformed_pgframe

def fit_transform(self, pgframe):
Expand Down Expand Up @@ -373,6 +455,8 @@ def _apply_encoder(self, frame, prop, encoder, encoder_type="category"):
vectors = encoder.transform(
np.reshape(frame[prop].to_list(), (frame[prop].shape[0], 1)))
if vectors is not None:
if issparse(vectors):
vectors = vectors.todense()
df = pd.DataFrame(columns=[prop], index=frame.index)
df[prop] = vectors.tolist()
return df
Expand Down
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ def random_text_corpus():
)
return corpus


@pytest.fixture(scope="session")
def random_words():
corpus = list(random.sample(words.words(), 700))
corpus = list(set(random.sample(words.words(), 700)))
return corpus
19 changes: 18 additions & 1 deletion tests/preprocess/test_semantic_pg_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,24 @@ def test_encoding(random_pgframe):
text_encoding="tfidf",
standardize_numeric=True)
transformed_frame = hom_encoder.fit_transform(random_pgframe)

# Test dimensionality reduction
hom_encoder = ScikitLearnPGEncoder(
node_properties=["weight", "color", "desc"],
edge_properties=["mi", "distance", "shapes", "desc"],
edge_features=True,
heterogeneous=False,
encode_types=True,
drop_types=True,
text_encoding="word2vec",
standardize_numeric=True,
reduce_node_dims=True,
n_node_components=32,
reduce_edge_dims=True,
n_edge_components=32)
transformed_frame = hom_encoder.fit_transform(random_pgframe)

het_encoder = ScikitLearnPGEncoder(
node_properties={
"Apple": ["weight", "color", "desc"],
"Orange": ["color", "desc"],
Expand All @@ -106,4 +123,4 @@ def test_encoding(random_pgframe):
drop_types=True,
text_encoding="word2vec",
standardize_numeric=True)
transformed_frame = hom_encoder.fit_transform(random_pgframe)
transformed_frame = het_encoder.fit_transform(random_pgframe)

0 comments on commit 3bb2e75

Please sign in to comment.