From 46d28fdeba6762331beef0132b6b4252bb782657 Mon Sep 17 00:00:00 2001 From: Unknown Date: Tue, 29 Jun 2021 13:45:57 +0200 Subject: [PATCH 1/3] Move FingerprintDistanceGenerator.*_matrix methods to matrix module --- .../fingerprint_distance_generator.py | 87 +-------- kissim/comparison/matrix.py | 169 ++++++++++++++++++ 2 files changed, 174 insertions(+), 82 deletions(-) create mode 100644 kissim/comparison/matrix.py diff --git a/kissim/comparison/fingerprint_distance_generator.py b/kissim/comparison/fingerprint_distance_generator.py index 369ccaa8..00a83b12 100644 --- a/kissim/comparison/fingerprint_distance_generator.py +++ b/kissim/comparison/fingerprint_distance_generator.py @@ -8,10 +8,9 @@ import logging from tqdm.auto import tqdm -import numpy as np -import pandas as pd from kissim.comparison import BaseGenerator, FingerprintDistance, FeatureDistancesGenerator +from kissim.comparison import matrix from kissim.comparison.utils import format_weights logger = logging.getLogger(__name__) @@ -25,7 +24,7 @@ class FingerprintDistanceGenerator(BaseGenerator): Attributes ---------- data : pandas.DataFrame - Fingerprint distance and bit coverag for each structure pair (kinase pair). + Fingerprint distance and bit coverage for each structure pair (kinase pair). structure_kinase_ids : list of list Structure and kinase IDs for structures in dataset. """ @@ -214,23 +213,7 @@ def structure_distance_matrix(self, coverage_min=0.0): Structure distance matrix. 
""" - # Filter by coverage - data = self.data[self.data["bit_coverage"] >= coverage_min] - # Data for upper half of the matrix - pairs_upper = data[["structure.1", "structure.2", "distance"]] - # Data for lower half of the matrix - pairs_lower = pairs_upper.rename( - columns={"structure.1": "structure.2", "structure.2": "structure.1"} - ) - - # Concatenate upper and lower matrix data - pairs = pd.concat([pairs_upper, pairs_lower]).sort_values(["structure.1", "structure.2"]) - # Convert to matrix - matrix = pairs.pivot(columns="structure.2", index="structure.1", values="distance") - # Matrix diagonal is NaN > set to 0.0 - np.fill_diagonal(matrix.values, 0) - - return matrix + return matrix.structure_distance_matrix(self.data, coverage_min) def kinase_distance_matrix(self, by="minimum", fill_diagonal=True, coverage_min=0.0): """ @@ -257,36 +240,7 @@ def kinase_distance_matrix(self, by="minimum", fill_diagonal=True, coverage_min= Kinase distance matrix. """ - if by == "size": - fill_diagonal = False - - # Data for upper half of the matrix - pairs_upper = self.kinase_distances(by, coverage_min).reset_index()[ - ["kinase.1", "kinase.2", "distance"] - ] - # Data for lower half of the matrix - pairs_lower = pairs_upper.rename(columns={"kinase.1": "kinase.2", "kinase.2": "kinase.1"}) - - # Concatenate upper and lower matrix data - pairs = ( - pd.concat([pairs_upper, pairs_lower]) - .sort_values(["kinase.1", "kinase.2"]) - .drop_duplicates() - .reset_index(drop=True) - ) - - # Convert to matrix - matrix = pairs.pivot(columns="kinase.2", index="kinase.1", values="distance") - - if fill_diagonal: - np.fill_diagonal(matrix.values, 0) - - # If matrix contains number of structure pairs: NaN > 0, cast to int - if by == "size": - matrix = matrix.fillna(0) - matrix = matrix.astype("int64") - - return matrix + return matrix.kinase_distance_matrix(self.data, by, fill_diagonal, coverage_min) def kinase_distances(self, by="minimum", coverage_min=0.0): """ @@ -307,35 +261,4 @@ def 
kinase_distances(self, by="minimum", coverage_min=0.0): Fingerprint distance and coverage for kinase pairs. """ - # Filter by coverage - data = self.data[self.data["bit_coverage"] >= coverage_min].reset_index() - # Group by kinase names - structure_distances_grouped_by_kinases = data.groupby( - by=["kinase.1", "kinase.2"], sort=False - ) - - # Get distance values per kinase pair based on given condition - # Note: For min/max we'd like to know which structure pairs were selected! - by_terms = "minimum maximum mean median size std".split() - - if by == "minimum": - kinase_distances = data.iloc[ - structure_distances_grouped_by_kinases["distance"].idxmin() - ].set_index(["kinase.1", "kinase.2"]) - elif by == "maximum": - kinase_distances = data.iloc[ - structure_distances_grouped_by_kinases["distance"].idxmax() - ].set_index(["kinase.1", "kinase.2"]) - elif by == "mean": - kinase_distances = structure_distances_grouped_by_kinases.mean()[["distance"]] - elif by == "median": - kinase_distances = structure_distances_grouped_by_kinases.median()[["distance"]] - elif by == "size": - kinase_distances = structure_distances_grouped_by_kinases.size().to_frame("distance") - elif by == "std": - kinase_distances = structure_distances_grouped_by_kinases.std()[["distance"]] - kinase_distances = round(kinase_distances, 3) - else: - raise ValueError(f'Condition "by" unknown. Choose from: {", ".join(by_terms)}') - - return kinase_distances + return matrix.kinase_distances(self.data, by, coverage_min) diff --git a/kissim/comparison/matrix.py b/kissim/comparison/matrix.py new file mode 100644 index 00000000..0fc83746 --- /dev/null +++ b/kissim/comparison/matrix.py @@ -0,0 +1,169 @@ +""" +Calculates structure distance matrices and kinase distance matrices. +""" + +import numpy as np +import pandas as pd + + +def structure_distance_matrix(structure_distances, coverage_min=0.0): + """ + Get fingerprint distances for all structure pairs in the form of a matrix (DataFrame). 
+ + Parameters + ---------- + structure_distances : pandas.DataFrame + Fingerprint distance and bit coverage for each structure pair (kinase pair). + fill : bool + Fill or fill not (default) lower triangle of distance matrix. + coverage_min : float + Returns only pairs with a user-defined minimum coverage (defaults to 0.0, i.e. no + coverage restrictions). + + Returns + ------- + pandas.DataFrame + Structure distance matrix. + """ + + data = structure_distances + + # Filter by coverage + data = data[data["bit_coverage"] >= coverage_min] + # Data for upper half of the matrix + pairs_upper = data[["structure.1", "structure.2", "distance"]] + # Data for lower half of the matrix + pairs_lower = pairs_upper.rename( + columns={"structure.1": "structure.2", "structure.2": "structure.1"} + ) + + # Concatenate upper and lower matrix data + pairs = pd.concat([pairs_upper, pairs_lower]).sort_values(["structure.1", "structure.2"]) + # Convert to matrix + matrix = pairs.pivot(columns="structure.2", index="structure.1", values="distance") + # Matrix diagonal is NaN > set to 0.0 + np.fill_diagonal(matrix.values, 0) + + return matrix + + +def kinase_distance_matrix( + structure_distances, by="minimum", fill_diagonal=True, coverage_min=0.0 +): + """ + Extract per kinase pair one distance value from the set of structure pair distance values + and return these fingerprint distances for all kinase pairs in the form of a matrix + (DataFrame). + + Parameters + ---------- + structure_distances : pandas.DataFrame + Fingerprint distance and bit coverage for each structure pair (kinase pair). + by : str + Condition on which the distance value per kinase pair is extracted from the set of + distances values per structure pair. Default: Minimum distance value. + fill_diagonal : bool + Fill diagonal with 0 (same kinase has distance of 0) by default. If `False`, diagonal + will be experimental values calculated based on the structure pairs per kinase pair. 
+ Is by default set to False, if `by="size"`. + coverage_min : float + Returns only pairs with a user-defined minimum coverage (defaults to 0.0, i.e. no + coverage restrictions). + + Returns + ------- + pandas.DataFrame + Kinase distance matrix. + """ + + if by == "size": + fill_diagonal = False + + # Data for upper half of the matrix + pairs_upper = kinase_distances(structure_distances, by, coverage_min).reset_index()[ + ["kinase.1", "kinase.2", "distance"] + ] + # Data for lower half of the matrix + pairs_lower = pairs_upper.rename(columns={"kinase.1": "kinase.2", "kinase.2": "kinase.1"}) + + # Concatenate upper and lower matrix data + pairs = ( + pd.concat([pairs_upper, pairs_lower]) + .sort_values(["kinase.1", "kinase.2"]) + .drop_duplicates() + .reset_index(drop=True) + ) + + # Convert to matrix + matrix = pairs.pivot(columns="kinase.2", index="kinase.1", values="distance") + + if fill_diagonal: + np.fill_diagonal(matrix.values, 0) + + # If matrix contains missing values, respective rows and columns must be dropped + column_has_missing_values = matrix.isna().any() + column_names_with_missing_values = column_has_missing_values[column_has_missing_values].index + matrix = matrix.drop(column_names_with_missing_values, axis=0).drop( + column_names_with_missing_values, axis=1 + ) + + # If matrix contains number of structure pairs: NaN > 0, cast to int + if by == "size": + matrix = matrix.astype("int64") + + return matrix + + +def kinase_distances(structure_distances, by="minimum", coverage_min=0.0): + """ + Extract per kinase pair one distance value from the set of structure pair distance values. + + Parameters + ---------- + structure_distances : pandas.DataFrame + Fingerprint distance and bit coverage for each structure pair (kinase pair). + by : str + Condition on which the distance value per kinase pair is extracted from the set of + distances values per structure pair. Default: Minimum distance value. 
+ coverage_min : float + Returns only pairs with a user-defined minimum coverage (defaults to 0.0, i.e. no + coverage restrictions). + + Returns + ------- + pandas.DataFrame + Fingerprint distance and coverage for kinase pairs. + """ + + data = structure_distances + + # Filter by coverage + data = data[data["bit_coverage"] >= coverage_min].reset_index() + # Group by kinase names + structure_distances_grouped_by_kinases = data.groupby(by=["kinase.1", "kinase.2"], sort=False) + + # Get distance values per kinase pair based on given condition + # Note: For min/max we'd like to know which structure pairs were selected! + by_terms = "minimum maximum mean median size std".split() + + if by == "minimum": + kinase_distances = data.iloc[ + structure_distances_grouped_by_kinases["distance"].idxmin() + ].set_index(["kinase.1", "kinase.2"]) + elif by == "maximum": + kinase_distances = data.iloc[ + structure_distances_grouped_by_kinases["distance"].idxmax() + ].set_index(["kinase.1", "kinase.2"]) + elif by == "mean": + kinase_distances = structure_distances_grouped_by_kinases.mean()[["distance"]] + elif by == "median": + kinase_distances = structure_distances_grouped_by_kinases.median()[["distance"]] + elif by == "size": + kinase_distances = structure_distances_grouped_by_kinases.size().to_frame("distance") + elif by == "std": + kinase_distances = structure_distances_grouped_by_kinases.std()[["distance"]] + kinase_distances = round(kinase_distances, 3) + else: + raise ValueError(f'Condition "by" unknown. 
Choose from: {", ".join(by_terms)}') + + return kinase_distances From ddbd62a2eb683df49f11926d67fe9b1da6486c28 Mon Sep 17 00:00:00 2001 From: Unknown Date: Tue, 29 Jun 2021 13:53:28 +0200 Subject: [PATCH 2/3] Move check for matrix NaN values from matrix to tree module --- kissim/comparison/matrix.py | 7 ------- kissim/comparison/tree.py | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kissim/comparison/matrix.py b/kissim/comparison/matrix.py index 0fc83746..079ff81b 100644 --- a/kissim/comparison/matrix.py +++ b/kissim/comparison/matrix.py @@ -100,13 +100,6 @@ def kinase_distance_matrix( if fill_diagonal: np.fill_diagonal(matrix.values, 0) - # If matrix contains missing values, respective rows and columns must be dropped - column_has_missing_values = matrix.isna().any() - column_names_with_missing_values = column_has_missing_values[column_has_missing_values].index - matrix = matrix.drop(column_names_with_missing_values, axis=0).drop( - column_names_with_missing_values, axis=1 - ) - # If matrix contains number of structure pairs: NaN > 0, cast to int if by == "size": matrix = matrix.astype("int64") diff --git a/kissim/comparison/tree.py b/kissim/comparison/tree.py index 6c4ef850..b358bec8 100644 --- a/kissim/comparison/tree.py +++ b/kissim/comparison/tree.py @@ -122,6 +122,13 @@ def from_distance_matrix( # Curate diagonal - set to 0 np.fill_diagonal(distance_matrix.values, 0) + # If matrix contains missing values, respective rows and columns must be dropped + column_has_missing_values = distance_matrix.isna().any() + column_names_with_missing_values = column_has_missing_values[column_has_missing_values].index + distance_matrix = distance_matrix.drop(column_names_with_missing_values, axis=0).drop( + column_names_with_missing_values, axis=1 + ) + # Hierarchical clustering logger.info( f"Clustering (method: {clustering_method}) and " From fb42cabe87249f0a5eefdd8c11903e76facfc876 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 30 Jun 2021 
08:08:54 +0200 Subject: [PATCH 3/3] Kinase matrix for size: Fill NaN with 0 (got lost during code move) --- kissim/comparison/matrix.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kissim/comparison/matrix.py b/kissim/comparison/matrix.py index 079ff81b..ff8560c6 100644 --- a/kissim/comparison/matrix.py +++ b/kissim/comparison/matrix.py @@ -102,6 +102,7 @@ def kinase_distance_matrix( # If matrix contains number of structure pairs: NaN > 0, cast to int if by == "size": + matrix = matrix.fillna(0) matrix = matrix.astype("int64") return matrix