-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopology_features.py
88 lines (68 loc) · 2.25 KB
/
topology_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# %% [markdown]
# ## Import Packages
# %%
#%% Import Packages
import os
# import pathlib
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from helper.topology_functions import extract_features
# Number of available cores, read from the NSLOTS environment variable;
# falls back to 2 when the variable is not set.
nslots = int(os.environ.get('NSLOTS', 2))
# Report the core count and the working directory
# (expected to be '/gpfs0/shai/users/barryb/link-predict').
print('cores:', nslots, ', dir:', os.getcwd())
# %% [markdown]
# ## Load data
# %%
# Edge lists of the subsampled networks.
print('Loading dataframe', flush=True)
df = pd.read_csv('data/processed/networks/subsamples_edge_lists.csv', header=0)

# Metadata accompanying the subsampled networks.
print('Loading metadata', flush=True)
df_meta = pd.read_csv('data/processed/networks/subsamples_metadata.csv', header=0)

# Force both level columns to strings in a single cast.
# TODO: fix this in the data processing (I thought I did but there are errors)
df = df.astype({'lower_level': str, 'higher_level': str})
# %% [markdown]
# ## Set configurations
# %% [markdown]
# Select features
# %%
# Names of the features requested from extract_features.
# Trailing remarks are the author's runtime observations for each feature.
features_list = [
    "network_size",
    "species_ratio",
    "interactions_count",
    "edge_connectivity",  # sometimes a bit slow
    "density",
    "bipartite_clustering",  # slow
    # NOTE(review): capitalised unlike the other names — presumably the exact
    # key extract_features expects; confirm before normalising.
    "Spectral_bipartivity",  # sometimes a bit slow (?)
    "average_clustering",
    "degree_assortativity_coefficient",
    "global_efficiency",  # sometimes a bit slow
    "local_efficiency",  # sometimes a bit slow
    "connected_components",
    "degree",
    "latapy_clustering",
    "node_redundancy",  # sometimes slow
    "betweenness_centrality",  # a bit slow
    "degree_centrality",
    "closeness_centrality",
    "average_neighbor_degree",
    "pagerank",
    "hits_hubs",
    "hits_authorities",
    "isolate",
    "preferential_attachment",
    "shortest_path_length",
    "shortest_paths_count",  # slow
    "friends_measure",  # slow
    # slow; also returns 'flow_infomap' and 'modular_centrality'
    "same_community_infomap",
]
# %%
multiprocess = True  # parallel processing
# %% [markdown]
# ## Extract Network Features
# %%
# Run the feature extraction once: hand over every allotted core when
# multiprocessing is enabled, otherwise restrict the run to a single worker.
# The result table is written to `table_path` in both cases.
extract_features(
    df,
    features_list,
    nslots=nslots if multiprocess else 1,
    table_path='data/processed/features/features_py.csv',
)
# %%
print('Done!')