From 9aeb757d05cad7eb679803cdd1ba685d616f78ca Mon Sep 17 00:00:00 2001
From: bkolosk1
Date: Mon, 20 Jan 2025 14:05:51 +0100
Subject: [PATCH] Add network-based features.

---
 autoBOTLib/features/features_word_graph.py | 155 ++++++++++++++++++
 .../optimization_feature_constructors.py   |  22 ++-
 2 files changed, 173 insertions(+), 4 deletions(-)
 create mode 100644 autoBOTLib/features/features_word_graph.py

diff --git a/autoBOTLib/features/features_word_graph.py b/autoBOTLib/features/features_word_graph.py
new file mode 100644
index 0000000..a9e84d2
--- /dev/null
+++ b/autoBOTLib/features/features_word_graph.py
@@ -0,0 +1,155 @@
+import logging
+import random
+import networkx as nx
+import numpy as np
+import pandas as pd
+from nltk import word_tokenize
+from nltk.stem import PorterStemmer
+from tqdm import tqdm
+
+logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S")
+logging.getLogger().setLevel(logging.INFO)
+
+
+class WordGraph:
+    def __init__(self, fast=True, verbose=True, window_size=2, sample_ratio=0.1, repeats=10):
+        """
+        Class initialization method.
+
+        :param fast: Perform Monte Carlo style feature estimation on sampled
+            subgraphs instead of a single exact computation
+        :param verbose: Log progress during transformation
+        :param window_size: The window size used for the word co-occurrence graphs
+        :param sample_ratio: The fraction of nodes to sample for the Monte Carlo
+            estimation
+        :param repeats: The number of sampling repetitions performed when fast=True
+        """
+        self.verbose = verbose
+        self.fast = fast
+        self.window_size = window_size
+        self.sample_ratio = sample_ratio
+        self.repeats = repeats if fast else 1
+        self.features = None
+
+    def fit(self, text_list):
+        # Stateless transformer; nothing is estimated from the training corpus.
+        pass
+
+    def transform(self, new_documents):
+        if not isinstance(new_documents, list):
+            new_documents = new_documents.values.tolist()
+        if self.verbose:
+            logging.info("[Network Features] Transforming new documents.")
+        data = []
+        for text in tqdm(new_documents):
+            tokens = self.preprocess_text(text)
+            G = self.build_cooccurrence_graph(tokens, self.window_size)
+            if len(list(G.nodes)) < 2:
+                data.append(self.empty_features())
+                continue
+            repeated_metrics = [
+                self.compute_fast_metrics(self.sample_subgraph(G, self.sample_ratio))
+                for _ in range(self.repeats)
+            ]
+            if self.repeats > 1:
+                aggregated = {}
+                for key in repeated_metrics[0]:
+                    values = [metrics[key] for metrics in repeated_metrics]
+                    aggregated[f"{key}_avg"] = np.mean(values)
+                    aggregated[f"{key}_std"] = np.std(values)
+                    aggregated[f"{key}_min"] = np.min(values)
+                    aggregated[f"{key}_max"] = np.max(values)
+                data.append(list(aggregated.values()))
+                self.features = list(aggregated.keys())
+            else:
+                metrics = repeated_metrics[0]
+                data.append(list(metrics.values()))
+                self.features = list(metrics.keys())
+
+        return np.array(data)
+
+    def fit_transform(self, documents, b=None):
+        self.fit(documents)
+        return self.transform(documents)
+
+    def get_feature_names_out(self):
+        return self.features
+
+    def build_cooccurrence_graph(self, tokens, window_size):
+        # Link each token to the tokens that follow it within the window.
+        G = nx.Graph()
+        for i, word in enumerate(tokens):
+            for j in range(i + 1, min(i + window_size, len(tokens))):
+                G.add_edge(word, tokens[j])
+        return G
+
+    def preprocess_text(self, text):
+        tokens = word_tokenize(text.lower())
+        stemmer = PorterStemmer()
+        return [stemmer.stem(word) for word in tokens if word.isalnum()]
+
+    def sample_subgraph(self, G, sample_ratio):
+        nodes = list(G.nodes)
+        sample_size = int(sample_ratio * len(nodes))
+        sample_size = max(1, min(sample_size, len(nodes)))
+        sampled_nodes = random.sample(nodes, sample_size)
+        return G.subgraph(sampled_nodes)
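+
+    # Worked example for the two methods above: with window_size=2,
+    # build_cooccurrence_graph links each token only to its immediate
+    # successor, so ["the", "cat", "sat"] yields the edges ("the", "cat")
+    # and ("cat", "sat"), while window_size=3 would also add ("the", "sat").
+    # Duplicate pairs collapse into a single undirected edge, and
+    # sample_subgraph then keeps a random node-induced subgraph on which
+    # the metrics below are estimated.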
+
+    def compute_fast_metrics(self, G):
+        num_nodes = len(G.nodes)
+        num_edges = len(G.edges)
+        density = nx.density(G)
+        avg_degree = sum(dict(G.degree()).values()) / num_nodes if num_nodes > 0 else 0
+        avg_clustering = nx.average_clustering(G) if num_nodes > 0 else 0
+
+        if num_nodes > 0:
+            betweenness = nx.betweenness_centrality(G, normalized=True)
+            avg_betweenness = np.mean(list(betweenness.values()))
+        else:
+            avg_betweenness = 0
+
+        if num_nodes > 0:
+            # Closeness is computed on the largest connected component only.
+            largest_cc = max(nx.connected_components(G), key=len)
+            subG = G.subgraph(largest_cc)
+            closeness = nx.closeness_centrality(subG)
+            avg_closeness = np.mean(list(closeness.values()))
+        else:
+            avg_closeness = 0
+
+        num_components = nx.number_connected_components(G)
+
+        if num_nodes > 0:
+            largest_cc_size = max(len(comp) for comp in nx.connected_components(G))
+        else:
+            largest_cc_size = 0
+
+        if num_nodes > 0:
+            pagerank = nx.pagerank(G)
+            avg_pagerank = np.mean(list(pagerank.values()))
+            max_pagerank = np.max(list(pagerank.values()))
+        else:
+            avg_pagerank = 0
+            max_pagerank = 0
+
+        return {
+            "num_nodes": num_nodes,
+            "num_edges": num_edges,
+            "density": density,
+            "avg_degree": avg_degree,
+            "avg_clustering": avg_clustering,
+            "avg_betweenness": avg_betweenness,
+            "avg_closeness": avg_closeness,
+            "num_components": num_components,
+            "largest_cc_size": largest_cc_size,
+            "avg_pagerank": avg_pagerank,
+            "max_pagerank": max_pagerank,
+        }
+
+    def empty_features(self):
+        # 11 metrics, each expanded into four aggregates when repeats > 1;
+        # this mirrors the branching in transform() (rather than self.fast),
+        # so fast=True with repeats=1 still yields a consistent row width.
+        return [0] * 11 * (4 if self.repeats > 1 else 1)
+
+
+if __name__ == "__main__":
+    df = pd.read_csv("../../data/insults/train.tsv", sep="\t")
+    example_text = df["text_a"]
+    labels = df["label"].tolist()
+    clx = WordGraph(fast=True, window_size=2, sample_ratio=0.1, repeats=10)
+    sim_features = clx.fit_transform(example_text)
+    print(clx.get_feature_names_out())
+    print(sim_features.shape)
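+    # With fast=True and repeats > 1, each document is summarised by the 11
+    # graph metrics aggregated four ways (avg, std, min, max), i.e. 44
+    # columns, so sim_features.shape is (len(example_text), 44).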
diff --git a/autoBOTLib/optimization/optimization_feature_constructors.py b/autoBOTLib/optimization/optimization_feature_constructors.py
index 9bcc93d..d882063 100644
--- a/autoBOTLib/optimization/optimization_feature_constructors.py
+++ b/autoBOTLib/optimization/optimization_feature_constructors.py
@@ -19,6 +19,7 @@
 from autoBOTLib.features.features_token_relations import *
 from autoBOTLib.features.features_contextual import *
 from autoBOTLib.features.features_images import *
+from autoBOTLib.features.features_word_graph import *
 import string
 import re
@@ -61,7 +62,8 @@ def PerceptronTagger():
     'concept_features', 'document_graph', 'relational_features_token',
     'topic_features', 'keyword_features', 'relational_features_char',
     'char_features', 'word_features', 'relational_features_bigram',
-    'contextual_features'
+    'contextual_features',
+    'word_graph'
 ]

 # This one is ~language agnostic
@@ -69,7 +71,8 @@
     'document_graph', 'neural_features_dbow', 'neural_features_dm',
     'topic_features', 'keyword_features', 'relational_features_char',
     'relational_features_token', 'char_features', 'word_features',
-    'relational_features_bigram', 'concept_features'
+    'relational_features_bigram', 'concept_features',
+    'word_graph'
 ]

 # MLJ paper versions
@@ -86,7 +89,8 @@
 feature_presets['symbolic'] = [
     'concept_features', 'relational_features_token', 'topic_features',
     'keyword_features', 'relational_features_char', 'char_features',
-    'word_features', 'pos_features', 'relational_features_bigram'
+    'word_features', 'pos_features', 'relational_features_bigram',
+    'word_graph'
 ]

 if not contextual_feature_library:
@@ -466,6 +470,10 @@ def get_features(df_data,

     topic_features = TopicDocs(ndim=embedding_dim)

+    word_graph = WordGraph(fast=True,
+                           window_size=2,
+                           sample_ratio=0.3,
+                           repeats=5)

     concept_features_transformer = ConceptFeatures(
         max_features=max_num_feat, knowledge_graph=memory_location)
@@ -570,7 +578,13 @@
                              contextual_features),
                          ('normalize', Normalizer(norm=normalization_norm))
-                     ]))
+                     ])),
+        "word_graph": ('word_graph',
+                       pipeline.Pipeline([
+                           ('s10', text_col(key='no_stopwords')),
+                           ('word_graph', word_graph),
+                           ('normalize', Normalizer(norm=normalization_norm))
+                       ])),
     }

     if include_image_transformer:
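
A minimal standalone sketch of the new extractor follows (not part of the
patch; the two toy documents and parameter values are illustrative, and
NLTK's 'punkt' tokenizer data is assumed to be available):

    from autoBOTLib.features.features_word_graph import WordGraph

    docs = ["the cat sat on the mat", "dogs chase cats around the yard"]
    wg = WordGraph(fast=True, window_size=2, sample_ratio=0.5, repeats=3)
    features = wg.fit_transform(docs)
    print(wg.get_feature_names_out())  # 44 aggregated metric names
    print(features.shape)              # (2, 44)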