-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph_theory_analysis.py
executable file
·317 lines (293 loc) · 16.6 KB
/
graph_theory_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
import json
import pandas as pd
import os
import networkx as nx
import numpy as np
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
from statistics import mean
from numpy import linalg
import pandas as pd
import argparse
import math
global folder_name
global root
import time
import datetime
import pathlib
def create_graph(df, max_nn_dist=102.4):
    """Build the graph of positive cells from a full pairwise distance matrix.

    Rows of ``df`` whose ``label`` equals 1 are kept; their ``x``/``y``
    columns are used as coordinates. Two cells are linked when their
    Euclidean distance is strictly lower than ``max_nn_dist``; the edge
    weight is that distance.

    Args:
        df (pandas.DataFrame): Table of cells with at least the columns
            ``label``, ``x`` and ``y``
            (see table_of_cells_after_segmentation.py).
        max_nn_dist (float): Maximum distance, in pixels, to link two cells.

    Returns:
        networkx.Graph: One node per positive cell, numbered by the row
        position of the cell among the positive cells of ``df``.

    Note:
        The dense distance matrix has a high memory cost;
        ``create_graph_local_search`` is the slower, lighter alternative
        when there are many positive cells.
    """
    # Keep only the cells that are positive to the marker.
    df_pos = df[df['label'] == 1]
    df_coord = pd.DataFrame(df_pos, columns=['x', 'y'])

    # Pairwise Euclidean distances between all positive cells.
    dist_matrix = distance_matrix(df_coord.values, df_coord.values)

    # Weighted adjacency matrix: keep the distance where two cells are close
    # enough, 0 elsewhere. A single mask replaces the per-cell Python loop and
    # no longer fails when a cell is not within max_nn_dist of itself
    # (max_nn_dist <= 0 or NaN coordinates).
    weighted_adj_matrix = np.where(dist_matrix < max_nn_dist, dist_matrix, 0.0)
    # A cell is never its own neighbour.
    np.fill_diagonal(weighted_adj_matrix, 0.0)

    # from_numpy_array treats zero entries as "no edge".
    G = nx.from_numpy_array(weighted_adj_matrix)
    return G
def create_graph_local_search(max_dist, args):
    """Build the graph of positive cells by searching neighbours tile by tile.

    Every ``.json`` tile file found in ``<rootdir>/<patient_id>/accept`` is
    processed in turn. For each positive cell (``label_id == 1``) of the
    tile, the distance to every positive cell listed by
    ``get_neighborhood_pos_cell_table`` (current tile and surrounding tiles)
    is computed; an edge weighted by the distance is added when the two
    cells are less than ``max_dist`` pixels apart. A positive cell without
    any neighbour stays in the graph as an isolated node.

    Args:
        max_dist (float): Maximum distance, in pixels, to link two cells.
        args (dict): Must provide the keys ``'rootdir'`` and
            ``'patient_id'`` used to locate the ``accept`` directory.

    Returns:
        networkx.Graph: Graph whose nodes are named ``"<x>_<y>"`` from the
        slide-level coordinates of the cells (tile offset + in-tile
        position).

    Note:
        Tile file names are parsed as ``<prefix>_<x>_<y>.json``.
    """
    G = nx.Graph()

    # List the directory once; entries that are not tile files are ignored
    # because their names cannot be parsed into tile coordinates.
    tiles_dir = os.path.join(args['rootdir'], args['patient_id'], 'accept')
    tile_names = [name for name in os.listdir(tiles_dir) if name.endswith('.json')]
    tot_to_treat = len(tile_names)

    for c_json_treated, c_json in enumerate(tile_names, start=1):
        # The context manager closes the tile file as soon as it is parsed.
        with open(os.path.join(tiles_dir, c_json), "r") as f_c_json:
            data_c_json = json.load(f_c_json)

        # Tile offset encoded in the file name.
        xul = int(c_json.split('_')[1])
        yul = int(c_json.split('_')[-1].split('.')[0])

        # Positive cells of the current tile and of the surrounding tiles.
        cells_in_neighborhood = get_neighborhood_pos_cell_table(c_json, xul, yul)
        neighbors_x = list(cells_in_neighborhood.iloc[:, 0])
        neighbors_y = list(cells_in_neighborhood.iloc[:, 1])

        for cell in data_c_json:
            if cell['label_id'] != 1:
                continue
            c_x = cell['x'] + xul
            c_y = cell['y'] + yul
            c_cell_name = str(c_x) + '_' + str(c_y)

            # Register the cell first so that it is kept even when it ends up
            # with no neighbour (isolated node).
            G.add_node(c_cell_name)

            for c_nx, c_ny in zip(neighbors_x, neighbors_y):
                d = distance_euclidienne(c_x, c_nx, c_y, c_ny)
                # d == 0 means the cell was matched with itself (the current
                # tile belongs to its own neighbourhood): no edge is kept for
                # a zero distance.
                if 0 < d < max_dist:
                    c_n_cell_name = str(c_nx) + '_' + str(c_ny)
                    G.add_edge(c_cell_name, c_n_cell_name, weight=d)

        # Progression (percentage of tiles processed).
        print((c_json_treated / tot_to_treat) * 100)

    return G
def get_neighborhood_pos_cell_table(c_json, xul, yul, tile_size=512, tiles_dir=None):
    """List the positive cells of a tile and of its eight surrounding tiles.

    The 3 x 3 block of tiles centred on the tile at (``xul``, ``yul``) is
    scanned. Each tile is looked up as ``<prefix>_<x>_<y>.json`` where
    ``<prefix>`` is the part of ``c_json`` before its first underscore;
    tiles without a file are skipped.

    Args:
        c_json (str): File name of the current tile; only its prefix is used.
        xul (int): x offset of the current tile, as encoded in its file name.
        yul (int): y offset of the current tile, as encoded in its file name.
        tile_size (int): Offset between two adjacent tiles, in pixels.
        tiles_dir (str | None): Directory holding the tile files. When
            omitted, ``<root>/<folder_name>/accept`` is built from the
            module-level globals ``root`` and ``folder_name``.

    Returns:
        pandas.DataFrame: Columns ``x`` and ``y``, one row per positive cell
        found, with the tile offset already added to the coordinates. Rows
        are numbered consecutively.
    """
    if tiles_dir is None:
        tiles_dir = os.path.join(root, folder_name, 'accept')
    prefix = c_json.split('_')[0]

    all_x = []
    all_y = []
    # Offsets of the neighbouring tiles (current tile included).
    for x in (xul - tile_size, xul, xul + tile_size):
        for y in (yul - tile_size, yul, yul + tile_size):
            json_name = prefix + '_' + str(x) + '_' + str(y) + '.json'
            try:
                with open(os.path.join(tiles_dir, json_name), "r") as f_c_json:
                    data_c_json = json.load(f_c_json)
            except FileNotFoundError:
                # No tile at this position (slide border or rejected tile).
                continue
            # Coordinates of the cells positive to the marker in that tile.
            l_x_neighbors, l_y_neighbors = get_neighbors_coords_list(data_c_json, x, y)
            all_x.extend(l_x_neighbors)
            all_y.extend(l_y_neighbors)

    # Build the table in one go: DataFrame.append no longer exists in
    # pandas >= 2.0 and re-copied the whole table for every tile.
    return pd.DataFrame({'x': all_x, 'y': all_y}, columns=['x', 'y'])
def get_neighbors_coords_list(data_c_json, x, y):
    """Return the shifted coordinates of the positive cells of one tile.

    Called by: get_neighborhood_pos_cell_table

    Args:
        data_c_json: Iterable of cell records, each providing the keys
            ``'label_id'``, ``'x'`` and ``'y'``.
        x: Offset added to the ``'x'`` value of every kept cell.
        y: Offset added to the ``'y'`` value of every kept cell.

    Returns:
        tuple[list, list]: The x coordinates and the y coordinates of the
        records whose ``'label_id'`` equals 1, in input order.
    """
    positive_cells = [record for record in data_c_json if record['label_id'] == 1]
    shifted_x = [x + record['x'] for record in positive_cells]
    shifted_y = [y + record['y'] for record in positive_cells]
    return shifted_x, shifted_y
def distance_euclidienne(c_x, c_nx, c_y, c_ny):
    """Return the Euclidean distance between (c_x, c_y) and (c_nx, c_ny).

    Note the argument order: both x coordinates come first, then both
    y coordinates.
    """
    delta_x = c_x - c_nx
    delta_y = c_y - c_ny
    squared_distance = delta_x ** 2 + delta_y ** 2
    return math.sqrt(squared_distance)
def _build_and_save_graph(df, max_nn_dist, args, graph_path):
    """Build the cell graph (local search as a fallback) and pickle it to graph_path."""
    try:
        G = create_graph(df, max_nn_dist=max_nn_dist)
    except Exception as err:
        # create_graph relies on a dense distance matrix and can fail (e.g. out
        # of memory) on large tables; fall back on the tile-by-tile search.
        print(f'create_graph failed: {err!r}')
        print('Creation of a graph by local search ')
        G = create_graph_local_search(max_nn_dist, args)
    print("Write graph")
    # NOTE(review): write_gpickle/read_gpickle were removed in networkx 3.0;
    # this script requires networkx < 3 as written.
    nx.write_gpickle(G, graph_path)
    return G


def _parse_coordinate(text):
    """Convert a coordinate read from a node name to int when possible, else float."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def _node_coordinates(G, df_coord):
    """Return two dicts (x and y) mapping every node of G to its coordinates.

    Graphs built by create_graph use integer nodes that are row positions in
    df_coord; graphs built by create_graph_local_search use "<x>_<y>" names.
    """
    xdict = {}
    ydict = {}
    for node in G.nodes:
        if isinstance(node, str):
            x_text, _, y_text = node.partition('_')
            xdict[node] = _parse_coordinate(x_text)
            ydict[node] = _parse_coordinate(y_text)
        else:
            xdict[node] = df_coord.iloc[node, 0]
            ydict[node] = df_coord.iloc[node, 1]
    return xdict, ydict


def _compute_global_features(G):
    """Compute the graph-level statistics of a non-empty graph as a JSON-serialisable dict."""
    nb_nodes = G.number_of_nodes()
    degrees = dict(nx.degree(G))
    component_sizes = [len(component) for component in nx.connected_components(G)]
    global_features = {}
    global_features['nb_nodes'] = nb_nodes  # Number of nodes
    global_features['nb_edges'] = G.number_of_edges()  # Number of edges
    # Percentage of unconnected nodes
    global_features['poucent_unconnected_nodes'] = (len(list(nx.isolates(G))) / nb_nodes) * 100
    # Percentage of end nodes (degree 1)
    global_features['poucent_end_nodes'] = (len([n for n in degrees if degrees[n] == 1]) / nb_nodes) * 100
    # Size of the largest connected component
    global_features['size_largest_cc'] = max(component_sizes)
    # Connected components average size normalized by the number of nodes
    global_features['avg_size_cc_norm_nb_nodes'] = mean(component_sizes) / nb_nodes
    global_features['global_efficiency'] = nx.global_efficiency(G)
    return global_features


def _compute_local_features(G, df_coord):
    """Compute the per-node statistics of G and return them as a data frame (one row per node)."""
    # Add cell coordinates as attributes to graph nodes
    xdict, ydict = _node_coordinates(G, df_coord)
    nx.set_node_attributes(G, xdict, "x_coord")
    nx.set_node_attributes(G, ydict, "y_coord")

    degrees = dict(nx.degree(G))  # Node degree
    print('Degrees Calculated')
    closeness_centrality = dict(nx.closeness_centrality(G))
    print('closeness_centrality done')
    weighted_closeness_centrality = dict(nx.closeness_centrality(G, distance='weight'))
    print('weighted_closeness_centrality done')
    pagerank_centrality = dict(nx.pagerank(G, weight='weight'))
    print('pagerank_centrality done')
    clustering_coeff = dict(nx.clustering(G))
    print('clustering_coeff done')
    weighted_clustering_coeff = dict(nx.clustering(G, weight='weight'))
    print('weighted_clustering_coeff done')

    # Every column is looked up node by node so that all the rows describe
    # the same cell, whatever the node naming of the graph.
    nodes = list(G.nodes)
    local_feature = pd.DataFrame()
    local_feature['x_coord'] = [xdict[n] for n in nodes]
    local_feature['y_coord'] = [ydict[n] for n in nodes]
    local_feature['degrees'] = [degrees[n] for n in nodes]
    local_feature['closeness_centrality'] = [closeness_centrality[n] for n in nodes]
    local_feature['weighted_closeness_centrality'] = [weighted_closeness_centrality[n] for n in nodes]
    local_feature['pagerank_centrality'] = [pagerank_centrality[n] for n in nodes]
    local_feature['clustering_coeff'] = [clustering_coeff[n] for n in nodes]
    local_feature['weighted_clustering_coeff'] = [weighted_clustering_coeff[n] for n in nodes]
    return local_feature


if __name__ == "__main__":
    # Command line: build (or reload) the graph of positive cells of one
    # patient, then write its global (JSON) and local (CSV) spatial statistics
    # next to the table of cells.
    parser = argparse.ArgumentParser(description='Graph theory - compute global and local features on KI67 detected cells')
    parser.add_argument('--rootdir', type=str, default= '/home/mathiane/LNENWork/PathonetCombinedDataSet2/PredBreastLNENDataset2Epoch50', help="rootdir where are the TNEXXXX_cells.csv")
    parser.add_argument('--patient_id', type=str, required=True, help='patient_id currently under analysis')
    args = vars(parser.parse_args())
    # Module-level names read by get_neighborhood_pos_cell_table.
    root = args['rootdir']
    folder_name = args['patient_id']
    # NOTE(review): these two names were used but never defined (NameError on
    # the first path check). They are assumed to be the patient id that also
    # names the patient folder - confirm against the file naming produced by
    # table_of_cells_after_segmentation.py.
    patient_id_name = folder_name
    patient_id_name_tneid = folder_name

    # Check if table of cells detected in the tumor area has been created
    cells_table_path = f'{root}/{folder_name}/{patient_id_name_tneid}_cells_detected_segmented.csv'
    segmentation = os.path.exists(cells_table_path)
    if segmentation:
        df = pd.read_csv(cells_table_path)
        print(df)
        df_pos = df[df['label'] == 1]
        print("df_pos ", df_pos.shape)
        df_coord = pd.DataFrame(df_pos, columns=['x', 'y'])

        # For an area of 2000 micron^2, with 256 pixel ~= 12 micron:
        #   r = sqrt(A / pi) = 25.23 micron = 538.24 pixel
        # i.e. roughly the hot spot area measured by pathologists.
        for max_nn_dist in [538.24]:  # You can add several radius in pixels to the list
            # NOTE(review): the label is fixed, so adding radii to the list
            # above would make every radius write to the same output files.
            max_nn_micron = '2000_micron'

            # Test if the graph has already been computed
            # NOTE(review): the lookup uses the "_segmentation" suffix while a
            # newly built graph is saved without it, so a graph written by this
            # script is not found on the next run. File names are kept as they
            # were; align them once the expected name is confirmed.
            cached_graph_name = f'{root}/{folder_name}/{patient_id_name}_{max_nn_micron}_segmentation.gpickle'
            if os.path.exists(cached_graph_name):
                Graph_name = cached_graph_name
                print(f'{Graph_name} alreafdy exist')
                G = nx.read_gpickle(Graph_name)
                if G.number_of_nodes() == 0:  # Empty graph: build it again
                    print("The graph loaded is empty!")
                    G = _build_and_save_graph(df, max_nn_dist, args, Graph_name)
            else:
                # The graph has never been created
                Graph_name = f'{root}/{folder_name}/{patient_id_name}_{max_nn_micron}.gpickle'
                G = _build_and_save_graph(df, max_nn_dist, args, Graph_name)

            # Compute spatial metrics
            if G.number_of_nodes() > 0:
                # Global features
                global_features = _compute_global_features(G)
                json_global_feature_fname = f'{root}/{folder_name}/{patient_id_name}_graph_{max_nn_micron}_global_features_segmented.json'
                with open(json_global_feature_fname, 'w') as f:
                    json.dump(global_features, f)
                print('Global spatial metric written')

                # Local features
                local_feature = _compute_local_features(G, df_coord)
                csv_local_feature_fname = f'{root}/{folder_name}/{patient_id_name}_graph_{max_nn_micron}_local_features_segmented.csv'
                local_feature.to_csv(csv_local_feature_fname, index=False)
                print('Local spatial metrics written')
            else:
                print('ERROR any positive cells > Spatial statistic cannot be computed!')
    else:
        print("Table of cell detected in the tumor area not found! Run > table_of_cells_after_segmentation.py for this patient id")