Spatial stats

IARCbioinfo · Jan 12, 2024 · f8b29dd · f8b29dd
1 parent f29db2b
commit f8b29dd
Show file tree

Hide file tree

Showing 2 changed files with 333 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -96,8 +96,8 @@ python table_of_cells_after_segmentation.py --inputdir ~/LNENWork/Ki67InferenceP
 - The `segmentation_dir` should follow the following architecture:
     - `segmentation_dir`
         - `patient_id`
-            - `prediction_tumor_normal_TNE1983.csv`
-- The table  `prediction_tumor_normal_TNE1983.csv` contains the following information:
+            - `prediction_tumor_normal_{patient_id}.csv`
+- For example, the table `prediction_tumor_normal_TNE1983.csv` contains the following information:
 
 |file_path|PredTumorNomal                                                                                      |
 |---------|----------------------------------------------------------------------------------------------------|
@@ -106,7 +106,7 @@ python table_of_cells_after_segmentation.py --inputdir ~/LNENWork/Ki67InferenceP
 |KI67_Tiling_256_256_40x/TNE1983.svs/accept/TNE1983.svs_21505_10241.jpg|Tumor                                                                                               |
 |KI67_Tiling_256_256_40x/TNE1983.svs/accept/TNE1983.svs_18945_21505.jpg|Normal                      
 
-- The output table is stored in  `inputdir/patient_id/patient_id_cells_detected_segmented.csv` and will contains the following information:
+- The output table is stored in `inputdir/patient_id/{patient_id}_cells_detected_segmented.csv` and will contain the following information:
 
 |x  |y                                                                                                   |label            |
 |---|----------------------------------------------------------------------------------------------------|-----------------|
@@ -115,10 +115,19 @@ python table_of_cells_after_segmentation.py --inputdir ~/LNENWork/Ki67InferenceP
 |17616.0|6663.0                                                                                              |2                |
 
 
-*Note: In this table, the label 1 corresponds to a positive cell and 2 to a negative cell.
+*Note: In this table, the label 1 corresponds to a positive cell and 2 to a negative cell.*
+
+### Step 6.2: Compute spatial metrics according to graph theory
+- The `graph_theory_analysis.py` script can be used to create a graph of the positive cells detected by Pathonet by connecting all the cells in a 2000 micron^2 area, according to which global and local spatial statistics are calculated.
+- Command line:
+```
+python graph_theory_analysis.py --rootdir /LNENWork/Ki67InferencePathonet --patient_id TNE1983
+```
+- This script generates the following output files in the `rootdir/patient_id` folder:
+    - `{patient_id}_2000_micron.gpickle`: graph
+    - `{patient_id}_graph_2000_micron_global_features_segmented.json`: global spatial statistics
+    - `{patient_id}_graph_2000_micron_local_features_segmented.csv`: local spatial statistics
+
 ## TO DO LIST
 
-+ :construction: Add NetworkX construction
-+ :construction: Table of cells
-+ :construction: Add Spatial statistics
 + :construction: Add presentation WSI
diff --git a/SpatialStatsGraph/graph_theory_analysis.py b/SpatialStatsGraph/graph_theory_analysis.py
@@ -0,0 +1,317 @@
+import json
+import pandas as pd
+import os
+import networkx as nx
+import numpy as np
+from scipy.spatial import distance_matrix
+import matplotlib.pyplot as plt
+from statistics import mean
+from numpy import linalg
+import pandas as pd
+import argparse
+import math
+global folder_name
+global root
+import time
+import datetime
+import pathlib
+
+
+
+def create_graph(df, max_nn_dist=102.4):
+    """ Create a graph: This function uses the coordinates of the positive cells detected by pathonet to create a graph by calculating a distance matrix. 
+    Two cells are linked (i.e. an edge is created) if the distance between them in terms of number of pixels is less than max_nn_dist.
+    Args :
+    - df (pandas data frame) : Table of cell coordinates see > table_of_cells_after_segmentation.py
+    - max_nn_dist: Maximum distance in pixels to link cells.
+    Return :
+    - G (Graph NetworkX object): Graph of positive cells detected by Pathonet for the current patient_id/WSI.
+    Note:
+    - This function has a high memory cost due to the calculation of the distance matrix; create_graph_local_search should be applied if there are many positive cells.
+    However, create_graph_local_search is slower than create_graph.
+    """
+    # Get the positive cells to the marker
+    df_pos = df[df['label'] == 1]
+    # Compute the distance matrix
+    df_coord = pd.DataFrame(df_pos, columns=['x', 'y'])
+    dist_matrix = distance_matrix(df_coord.values, df_coord.values)
+    # Created a weighted adjacency matrix
+    weighted_adj_matrix = np.zeros((dist_matrix.shape[0], dist_matrix.shape[1]))
+    for i_cell in range(dist_matrix.shape[0]):
+        # Get the distance vector for the current cell
+        c_cell = dist_matrix[i_cell,:]
+        # Get the index of all the neighboring cells of c_cell ie. if their distance is lower than max_nn_dist
+        index_sort = np.where(c_cell<max_nn_dist)[0].tolist() 
+        # Remove the current cell of the distance matrix
+        index_sort.remove(i_cell)
+        # Complete the weighted adjacency matrix in function of the neighbors found
+        if len(index_sort)> 0:
+            for ind in index_sort:
+                weighted_adj_matrix[i_cell,ind] = dist_matrix[i_cell,ind]
+    G = nx.from_numpy_array(weighted_adj_matrix)
+    return G
+
+def create_graph_local_search(max_dist, args):
+    """ create_graph_local_search: This function uses the coordinates of the positive cells detected by pathonet
+    to create a graph by searching locally for the potential neighbours of each cell. 
+    The tiles are considered one by one, and for each one we define the neighbouring tiles as the eight surrounding tiles.
+    For each neighbouring tile and for the current tile itself, the positive cells are listed. 
+    The Euclidean distances between each cell of the current tile and those of the cells included in the neighbourhood are calculated.
+    Edges are added to the graph between two cells less than max_dist pixels apart, otherwise the cell is added to the graph as an isolated cell. 
+    Args :
+        - max_nn_dist: Maximum distance in pixels to link cells.
+    Return :
+    - G (Graph NetworkX object): Graph of positive cells detected by Pathonet for the current patient_id/WSI.
+    """
+    # Create an empty graph
+    G = nx.Graph()
+    count_pos_cells = 0
+    c_json_treated  = 0
+    # Path to inference json file directory
+    tot_to_treat = len(os.listdir(os.path.join(args['rootdir'], args['patient_id'], 'accept')))
+    for c_json in os.listdir(os.path.join(args['rootdir'], args['patient_id'], 'accept')):
+        f_c_json = open(os.path.join(args['rootdir'], args['patient_id'],'accept'  ,c_json ), "r")
+        data_c_json = json.load(f_c_json)
+        # Get tile coordinates of the current tiles 
+        xul = int(c_json.split('_')[1])
+        yul = int(c_json.split('_')[-1].split('.')[0])
+        # Obtain a list of cells in the vicinity (9 tiles surrounding the current tile and itself).
+        cells_in_neighborhood = get_neighborhood_pos_cell_table(c_json, xul, yul)
+
+        # For every cell in the current tile
+        for cell in data_c_json:
+            # Define attibutes of the current cell
+            c_x = cell['x'] + xul
+            c_y = cell['y'] + yul
+            c_cell_name = str(c_x)+ '_' + str(c_y)
+            nb_neighbors_for_current_cell = 0
+            # Get its coordinates
+            if cell['label_id'] == 1:
+                c_x = cell['x'] + xul
+                c_y = cell['y'] + yul
+                # Check whether the cells listed in the neighbourhood tiles and the tile itself
+                # are less than max_dist from the current cell.
+                for i in range(cells_in_neighborhood.shape[0]):
+                    c_nx = cells_in_neighborhood.iloc[i,0]
+                    c_ny = cells_in_neighborhood.iloc[i,1]
+                    # Compute euclidian distance
+                    d = distance_euclidienne(c_x, c_nx, c_y, c_ny)
+                    # If two cells are less than max_dist apart, an edge is created between them.
+                    if d < max_dist:
+                        c_cell_name = str(c_x)+ '_' + str(c_y)
+                        c_n_cell_name = str(c_nx)+ '_' + str(c_ny)
+                        G.add_edge(c_cell_name, c_n_cell_name, weight=d)
+                        nb_neighbors_for_current_cell += 1
+                # If the cell has no neighbors add it to the graph as an isolated node
+                if nb_neighbors_for_current_cell == 0 : # Unconnected node
+                    G.add_node(c_cell_name)
+        # Progression
+        c_json_treated +=1 
+        print((c_json_treated / tot_to_treat)*100)
+
+    zero_edges = list(filter(lambda e: e[2] == 0, (e for e in G.edges.data('weight'))))
+    le_ids = list(e[:2] for e in zero_edges)
+    G.remove_edges_from(le_ids)
+    return G
+
+def get_neighborhood_pos_cell_table(c_json, xul, yul):
+    """
+    get_neighborhood_pos_cell_table: Allows to get a list of cells in the neighbouring tiles for this the 8 tiles surrounding the current tiles (c_json) 
+    are considered. This hypothesis is corrected considering an circle of 2000 micron^2.
+    Args:
+    - c_json : Current json files containing the list of detected cells
+    - xul and yul (int): - coordinated of the lower left corner of the current tile 
+    Return:
+    - cells_in_neighborhood (pandas data frame): Table of cells in the neighborhood
+    """
+    # Coordinated neibouring tiles
+    x_coords = [xul-512, xul, xul+512]
+    y_coords = [yul-512, yul, yul+512]
+    cells_in_neighborhood = pd.DataFrame(columns=['x', 'y'])
+    for x in x_coords:
+        for y in y_coords:
+            c_cells_in_neighborhood  = pd.DataFrame()
+            # Read json
+            json_name =c_json.split('_')[0] + '_' + str(x)  + '_' + str(y) + '.json'
+            if os.path.exists(os.path.join(root, folder_name, 'accept', json_name)):
+                f_c_json = open(os.path.join(root, folder_name,'accept'  ,json_name ), "r")
+                data_c_json = json.load(f_c_json)
+                # Get list of positive cells to the market
+                l_x_neighbors, l_y_neighbors = get_neighbors_coords_list(data_c_json, x, y)
+                c_cells_in_neighborhood['x'] = l_x_neighbors
+                c_cells_in_neighborhood['y'] = l_y_neighbors
+                cells_in_neighborhood = cells_in_neighborhood.append(c_cells_in_neighborhood)
+    return cells_in_neighborhood
+
+def get_neighbors_coords_list(data_c_json, x, y):
+    """
+    Obtain the list of cell coordinates in neighbouring tiles
+    Call by: get_neighborhood_pos_cell_table
+    """
+    l_x_neighbors  = []
+    l_y_neighbors = []
+    for cell in data_c_json:
+        if cell['label_id'] == 1:
+            l_x_neighbors.append(x+cell['x'])
+            l_y_neighbors.append(y+cell['y'])
+    return l_x_neighbors, l_y_neighbors
+
+
+
+def distance_euclidienne(c_x, c_nx, c_y, c_ny):
+    return math.sqrt((c_x - c_nx) ** 2 +(c_y - c_ny)**2)
+
+if __name__ == "__main__":
+
+    ## Argumennt
+    parser = argparse.ArgumentParser(description='Graph theory - compute global and local features on KI67 detected cells')
+    parser.add_argument('--rootdir', type=str,  default=  '/home/mathiane/LNENWork/PathonetCombinedDataSet2/PredBreastLNENDataset2Epoch50', help="rootdir where are the TNEXXXX_cells.csv")
+    parser.add_argument('--patient_id', type=str,    help='patient_id currently under analysis')
+    args = vars(parser.parse_args())
+    args = vars(parser.parse_args())
+    root =  args['rootdir']
+    folder_name = args['patient_id']
+    root = args['rootdir']
+
+    # Check if table of cells detected in the tumor area has been created
+    segmentation = False
+    if os.path.exists(f'{root}/{folder_name}/{patient_id_name_tneid}_cells_detected_segmented.csv'):
+        df = pd.read_csv(f'{root}/{folder_name}/{patient_id_name_tneid}_cells_detected_segmented.csv')
+        print(df)
+        segmentation = True
+    if segmentation:
+        # For an area of 2000 micron^2
+        # if 256 pixel ~= 12 micron
+        # Since r = sqrt(A/pi)
+        # r = 25.23 micron 
+        # 538.24 pixel = 25.23 micron 
+
+        for max_nn_dist in [538.24]: # You can add several radius in pixels to the list
+            # A radius of 538.25 px correspond to a circle of area of 2000 micron^2 ~= hot spot area measured by pathologists
+            max_nn_micron = '2000_micron'
+            # Test if the graph has already beeen computed
+            if  os.path.exists(f'{root}/{folder_name}/{patient_id_name}_{max_nn_micron}_segmentation.gpickle'):
+                Graph_name =  f'{root}/{folder_name}/{patient_id_name}_{max_nn_micron}_segmentation.gpickle'
+                graphs_created = True
+            else:
+                Graph_name =  f'{root}/{folder_name}/{patient_id_name}_{max_nn_micron}.gpickle'
+                graphs_created = False
+
+            # If the graph alredy exist we computed only the spatial statistics
+            if graphs_created :
+                print(f'{Graph_name}  alreafdy exist')
+                # Read the graph
+                G = nx.read_gpickle(Graph_name)
+
+                if G.number_of_nodes() == 0: # Empty graph
+                    print("The graph loaded is empty!")
+                    try:
+                        G = create_graph(df, max_nn_dist=max_nn_dist)
+                        print("Write graph")
+                        nx.write_gpickle(G, Graph_name)
+                        graphs_created = True
+                    except:
+                        print('Creation of a graph by local search ')
+                        G = create_graph_local_search( max_nn_dist, args)
+                        print("Write graph")
+                        nx.write_gpickle(G, Graph_name)
+                        graphs_created = True
+
+            # The graph have never been created
+            else: 
+                try:
+                    G = create_graph(df, max_nn_dist=max_nn_dist)
+                    print("Write graph")
+                    nx.write_gpickle(G, Graph_name)
+                    graphs_created = True
+                except:
+                    print('Creation of a graph by local search ')
+                    G = create_graph_local_search( max_nn_dist,  args)
+                    print("Write graph")
+                    nx.write_gpickle(G, Graph_name)
+                    graphs_created = True
+
+            # Compute spatial metric
+            if graphs_created:
+                ## Get global features
+                global_features = {}
+                if G.number_of_nodes() >0 :
+                    global_features['nb_nodes'] = G.number_of_nodes() # Number of nodes
+                    global_features['nb_edges'] = G.number_of_edges() # Number of edges
+                    global_features['poucent_unconnected_nodes'] =  (len(list(nx.isolates(G))) / 
+                                                                          G.number_of_nodes()) *100 # Pourcentage of unconnected nodes
+                    degrees = dict(nx.degree(G))
+                    global_features['poucent_end_nodes'] = (len([n for n in degrees if degrees[n]  ==  1]) /
+                                                                     G.number_of_nodes()) *100 # Pourcentage of end nodes
+                    global_features['size_largest_cc'] = len(max(nx.connected_components(G), 
+                                                                    key=len)) # Size of the largest connected component
+                    CG = nx.connected_components(G)
+                    global_features['avg_size_cc_norm_nb_nodes'] =  mean([len(g) for g in CG])/ G.number_of_nodes() # Connected components average size normalized by the number of nodes
+                    global_features['global_efficiency'] =  nx.global_efficiency(G)
+
+                    ## Save global features statistics 
+                    if segmentation:
+                        json_global_feature_fname = f'{root}/{folder_name}/{patient_id_name}_graph_{max_nn_micron}_global_features_segmented.json'
+                    else:
+                        json_global_feature_fname = f'{root}/{folder_name}/{patient_id_name}_graph_{max_nn_micron}_global_features.json'
+                    with open(json_global_feature_fname, 'w+') as f:
+                        json.dump(global_features, f)
+                    print('Global spatial metric written')
+
+                    # Compute local spatial statistics
+                    ## Add cell coordinates as attributes to graph nodes
+                    df_pos = df[df['label'] == 1]
+                    print("df_pos ", df_pos.shape)
+                    df_coord = pd.DataFrame(df_pos, columns=['x', 'y'])
+                    xdict = {}
+                    ydict = {}
+                    for i in range(df_coord.shape[0]):
+                        xdict[i] = df_coord.iloc[i,0]
+                        ydict[i] = df_coord.iloc[i,1]
+                    nx.set_node_attributes(G, xdict, "x_coord")
+                    nx.set_node_attributes(G, ydict, "y_coord")
+
+                    degrees = dict(nx.degree(G)) # Node degree
+                    print('Degrees Calculated')
+                    closeness_centrality =  dict(nx.closeness_centrality(G))
+                    print('closeness_centrality done')
+                    weighted_closeness_centrality = dict(nx.closeness_centrality(G, distance='weight'))
+                    print('weighted_closeness_centrality done') 
+                    pagerank_centrality = dict(nx.pagerank(G, weight= 'weight'))
+                    print('pagerank_centrality  done')
+                    # eigenvector_centrality_computed = False
+                    # try:
+                    #     eigenvector_centrality = dict(nx.eigenvector_centrality(G))
+                    #     print('eigenvector_centrality done')
+                    #     eigenvector_centrality_computed = True
+                    # except:
+                    #     print('Error with eigenvector_centrality')
+                    clustering_coeff = dict(nx.clustering(G))
+                    print('clustering_coeff done')
+                    weighted_clustering_coeff = dict(nx.clustering(G, weight='weight'))
+                    print('weighted_clustering_coeff done')
+
+                    # Save local spatial metric in a data frame 
+                    local_feature = pd.DataFrame()
+                    local_feature['x_coord'] = list(xdict.values())
+                    local_feature['y_coord'] = list(ydict.values())
+                    local_feature['degrees'] = list(degrees.values())
+                    local_feature['closeness_centrality'] = list(closeness_centrality.values())
+                    local_feature['weighted_closeness_centrality'] = list(weighted_closeness_centrality.values())
+                    local_feature['pagerank_centrality']  = list(pagerank_centrality.values())
+                    local_feature['clustering_coeff'] = list(clustering_coeff.values())
+                    local_feature['weighted_clustering_coeff'] = list(weighted_clustering_coeff.values())
+
+                    # Write local spatial metrics
+                    if segmentation:
+                        csv_local_feature_fname = f'{root}/{folder_name}/{patient_id_name}_graph_{max_nn_micron}_local_features_segmented.csv'
+                    else:
+                        csv_local_feature_fname = f'{root}/{folder_name}/{patient_id_name}_graph_{max_nn_micron}_local_features.csv'
+                    local_feature.to_csv(csv_local_feature_fname,index=False)
+                    print('Local spatial metrics  written')
+                else:
+                    print('ERROR any positive cells > Spatial statistic cannot be computed!')
+        else:
+            print('ERROR patient_id_name  ', patient_id_name)
+    else:
+        print("Table of cell detected in the tumor area not found! Run > table_of_cells_after_segmentation.py for this patient id")