Add network-based features.
bkolosk1 committed Jan 20, 2025
1 parent 55d0716 commit 9aeb757
Showing 2 changed files with 173 additions and 4 deletions.
155 changes: 155 additions & 0 deletions autoBOTLib/features/features_word_graph.py
@@ -0,0 +1,155 @@
import logging
import random
import networkx as nx
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm

logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S")
logging.getLogger().setLevel(logging.INFO)


class WordGraph:
def __init__(self, fast=True, verbose=True, window_size=2, sample_ratio=0.1, repeats=10):
"""
Class initialization method.
:param fast: Perform Monte-Carlo style feature estimation
:param window_size: what is the size for the word co-occurance graphs
:param sample_ratio: the percentage of nodes to sample for MCMS estimation
:param repeats:
"""
self.verbose = verbose
self.fast = fast
self.window_size = window_size
self.sample_ratio = sample_ratio
self.repeats = repeats if fast else 1
self.features = None

    def fit(self, text_list):
        # Stateless transformer: all graph statistics are computed per document at
        # transform time, so there is nothing to learn here.
        pass

def transform(self, new_documents):
if not isinstance(new_documents, list):
new_documents = new_documents.values.tolist()
if self.verbose:
logging.info("[Network Features] Transforming new documents.")
data = []
for text in tqdm(new_documents):
tokens = self.preprocess_text(text)
G = self.build_cooccurrence_graph(tokens, self.window_size)
if len(list(G.nodes)) < 2:
data.append(self.empty_features())
continue
            # Monte-Carlo estimation: compute the metrics on several randomly sampled
            # node-induced subgraphs instead of on the full co-occurrence graph.
            repeated_metrics = [
                self.compute_fast_metrics(self.sample_subgraph(G, self.sample_ratio))
                for _ in range(self.repeats)
            ]
            if self.repeats > 1:
                # Aggregate the repeated estimates into avg/std/min/max per metric.
                aggregated = {}
for key in repeated_metrics[0]:
values = [metrics[key] for metrics in repeated_metrics]
aggregated[f"{key}_avg"] = np.mean(values)
aggregated[f"{key}_std"] = np.std(values)
aggregated[f"{key}_min"] = np.min(values)
aggregated[f"{key}_max"] = np.max(values)
data.append(list(aggregated.values()))
self.features = list(aggregated.keys())
else:
metrics = repeated_metrics[0]
data.append(list(metrics.values()))
self.features = list(metrics.keys())

return np.array(data)

    def fit_transform(self, documents, b=None):
        # The second argument (labels) is accepted for scikit-learn API compatibility
        # but is not used.
        self.fit(documents)
        return self.transform(documents)

def get_feature_names_out(self):
return self.features

    def build_cooccurrence_graph(self, tokens, window_size):
        # Link each token to the tokens that follow it within the sliding window
        # (with window_size=2 each token is connected to its immediate successor only).
        G = nx.Graph()
        for i, word in enumerate(tokens):
            for j in range(i + 1, min(i + window_size, len(tokens))):
                G.add_edge(word, tokens[j])
        return G

    def preprocess_text(self, text):
        # Lowercase and tokenize (requires the NLTK 'punkt' tokenizer data), then keep
        # only alphanumeric tokens and stem them.
        tokens = word_tokenize(text.lower())
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in tokens if word.isalnum()]

def sample_subgraph(self, G, sample_ratio):
nodes = list(G.nodes)
sample_size = int(sample_ratio * len(nodes))
sample_size = max(1, min(sample_size, len(nodes)))
sampled_nodes = random.sample(nodes, sample_size)
return G.subgraph(sampled_nodes)

def compute_fast_metrics(self, G):
num_nodes = len(G.nodes)
num_edges = len(G.edges)
density = nx.density(G)
avg_degree = sum(dict(G.degree()).values()) / num_nodes if num_nodes > 0 else 0
avg_clustering = nx.average_clustering(G) if num_nodes > 0 else 0

if num_nodes > 0:
betweenness = nx.betweenness_centrality(G, normalized=True)
avg_betweenness = np.mean(list(betweenness.values()))
else:
avg_betweenness = 0

if num_nodes > 0:
largest_cc = max(nx.connected_components(G), key=len)
subG = G.subgraph(largest_cc)
closeness = nx.closeness_centrality(subG)
avg_closeness = np.mean(list(closeness.values()))
else:
avg_closeness = 0

num_components = nx.number_connected_components(G)

if num_nodes > 0:
largest_cc_size = max(len(comp) for comp in nx.connected_components(G))
else:
largest_cc_size = 0

if num_nodes > 0:
pagerank = nx.pagerank(G)
avg_pagerank = np.mean(list(pagerank.values()))
max_pagerank = np.max(list(pagerank.values()))
else:
avg_pagerank = 0
max_pagerank = 0

return {
"num_nodes": num_nodes,
"num_edges": num_edges,
"density": density,
"avg_degree": avg_degree,
"avg_clustering": avg_clustering,
"avg_betweenness": avg_betweenness,
"avg_closeness": avg_closeness,
"num_components": num_components,
"largest_cc_size": largest_cc_size,
"avg_pagerank": avg_pagerank,
"max_pagerank": max_pagerank,
}

    def empty_features(self):
        # 11 base metrics; with Monte-Carlo repeats each metric expands into
        # avg/std/min/max, matching the aggregation performed in transform().
        return [0] * 11 * (4 if self.repeats > 1 else 1)


if __name__ == "__main__":
df = pd.read_csv("../../data/insults/train.tsv", sep="\t")
example_text = df["text_a"]
labels = df["label"].tolist()
clx = WordGraph(fast=True, window_size=2, sample_ratio=0.1, repeats=10)
sim_features = clx.fit_transform(example_text)
print(clx.get_feature_names_out())
print(sim_features.shape)
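
A minimal standalone sketch (not part of the commit) of how the new extractor can be used outside autoBOT's own feature construction, mirroring the registration added in the second file below. The toy documents and the bare scikit-learn Pipeline are illustrative assumptions; NLTK's 'punkt' tokenizer data must be available for word_tokenize.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

from autoBOTLib.features.features_word_graph import WordGraph

docs = [
    "word graphs summarise each document as a small network of tokens",
    "network statistics such as density and clustering then become document features",
]

graph_pipeline = Pipeline([
    ("word_graph", WordGraph(fast=True, window_size=2, sample_ratio=0.3, repeats=5)),
    ("normalize", Normalizer(norm="l2")),
])

X = graph_pipeline.fit_transform(docs)
print(graph_pipeline.named_steps["word_graph"].get_feature_names_out())
print(X.shape)  # expected (2, 44): 11 metrics x {avg, std, min, max} when repeats > 1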
22 changes: 18 additions & 4 deletions autoBOTLib/optimization/optimization_feature_constructors.py
@@ -19,6 +19,7 @@
from autoBOTLib.features.features_token_relations import *
from autoBOTLib.features.features_contextual import *
from autoBOTLib.features.features_images import *
from autoBOTLib.features.features_word_graph import *

import string
import re
@@ -61,15 +62,17 @@ def PerceptronTagger():
'concept_features', 'document_graph', 'relational_features_token',
'topic_features', 'keyword_features', 'relational_features_char',
'char_features', 'word_features', 'relational_features_bigram',
'contextual_features'
'contextual_features',
'word_graph'
]

# This one is ~language agnostic
feature_presets['neurosymbolic-lite'] = [
'document_graph', 'neural_features_dbow', 'neural_features_dm',
'topic_features', 'keyword_features', 'relational_features_char',
'relational_features_token', 'char_features', 'word_features',
'relational_features_bigram', 'concept_features'
'relational_features_bigram', 'concept_features',
'word_graph'
]

# MLJ paper versions
@@ -86,7 +89,8 @@
feature_presets['symbolic'] = [
'concept_features', 'relational_features_token', 'topic_features',
'keyword_features', 'relational_features_char', 'char_features',
'word_features', 'pos_features', 'relational_features_bigram'
'word_features', 'pos_features', 'relational_features_bigram',
'word_graph'
]

if not contextual_feature_library:
Expand Down Expand Up @@ -466,6 +470,10 @@ def get_features(df_data,

topic_features = TopicDocs(ndim=embedding_dim)

word_graph = WordGraph(fast=True, window_size=2,
sample_ratio=0.3,
repeats=5)

concept_features_transformer = ConceptFeatures(
max_features=max_num_feat, knowledge_graph=memory_location)

@@ -570,7 +578,13 @@ def get_features(df_data,
contextual_features),
('normalize',
Normalizer(norm=normalization_norm))
]))
])),
"word_graph": ('word_graph',
pipeline.Pipeline([
('s10', text_col(key='no_stopwords')),
('word_graph', word_graph),
('normalize', Normalizer(norm=normalization_norm))
])),
}

if include_image_transformer:
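
For orientation (a summary, not part of the diff): the pipelines in this dict are keyed by feature-type name, so the new block is built whenever 'word_graph' appears in the active preset; the commit adds it to three presets, including 'neurosymbolic-lite' and 'symbolic'. A rough sketch of that selection step, where feature_pipelines stands in as a hypothetical name for the dict shown above:

# Hypothetical illustration only; 'feature_pipelines' is an assumed variable name.
preset = feature_presets['neurosymbolic-lite']  # now contains 'word_graph'
selected = [feature_pipelines[name] for name in preset if name in feature_pipelines]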
