From 538649000bffdde8a4ea671826cffdd1616d709a Mon Sep 17 00:00:00 2001 From: sergiomarco25 Date: Thu, 19 Dec 2024 13:58:12 +0100 Subject: [PATCH] errors_correction6 --- src/troutpy/pl/plotting.py | 133 +++++++++++++++++----------- src/troutpy/tl/NMF.py | 77 +++++++++------- src/troutpy/tl/estimate_density.py | 4 +- src/troutpy/tl/interactions.py | 2 +- src/troutpy/tl/quantify_xrna.py | 8 +- src/troutpy/tl/segmentation_free.py | 6 +- src/troutpy/tl/source_cell.py | 76 ++++++++++------ 7 files changed, 187 insertions(+), 119 deletions(-) diff --git a/src/troutpy/pl/plotting.py b/src/troutpy/pl/plotting.py index 2633c46..09083fd 100644 --- a/src/troutpy/pl/plotting.py +++ b/src/troutpy/pl/plotting.py @@ -11,9 +11,6 @@ from matplotlib.colors import Colormap, Normalize from pathlib import Path - - - def sorted_heatmap(celltype_by_feature, output_path:str='',filename:str="Heatmap_target_cells_by_gene",format='pdf',cmap='viridis',vmax=None,save=False,figsize=(10, 10)): """ Plots the heatmap of target cells by gene. @@ -195,47 +192,47 @@ def plot_crosstab(data, xvar: str = '', yvar: str = '', normalize=True, axis=1, ----------- data : pd.DataFrame Input dataset containing the variables for the cross-tabulation. - + xvar : str, optional (default: '') The variable to use on the x-axis for the cross-tabulation. - + yvar : str, optional (default: '') The variable to use on the y-axis for the cross-tabulation. - + normalize : bool, optional (default: True) Whether to normalize the cross-tabulated data (percentages). If True, the data will be normalized. - + axis : int, optional (default: 1) The axis to normalize across. Use `1` for row normalization and `0` for column normalization. - + kind : str, optional (default: 'barh') The kind of plot to generate. 
Options include: - 'barh': Horizontal bar plot - 'bar': Vertical bar plot - 'heatmap': Heatmap visualization - 'clustermap': Clustermap visualization - + save : bool, optional (default: True) If True, the plot will be saved to a file. - + figures_path : str, optional (default: '') The directory path where the figure should be saved. If not specified, the plot will be saved in the current directory. - + stacked : bool, optional (default: True) If True, the bar plots will be stacked. Only applicable for 'barh' and 'bar' plot kinds. - + figsize : tuple, optional (default: (6, 10)) The size of the figure for the plot (width, height). - + cmap : str, optional (default: 'viridis') The colormap to use for the plot, especially for heatmap and clustermap visualizations. - + saving_format : str, optional (default: 'pdf') The format to save the plot in. Options include 'png', 'pdf', etc. - + sortby : str, optional (default: None) The column or row to sort the cross-tabulated data by before plotting. - + Returns: -------- None @@ -314,7 +311,7 @@ def pie_of_positive(data, groupby: str = '', figures_path: str = '', save: bool """ plt.figure() - y = np.array([np.sum(data[groupby] == False), np.sum(data[groupby] )]) + y = np.array([np.sum(~data[groupby]), np.sum(data[groupby] )]) mylabels = [f"{groupby}=False", f"{groupby}=True"] plt.pie(y, labels=mylabels, colors=['#a0b7e0', '#c5e493']) @@ -326,7 +323,7 @@ def pie_of_positive(data, groupby: str = '', figures_path: str = '', save: bool def genes_over_noise(sdata, scores_by_genes,layer='extracellular_transcripts', output_path:str='',save=True,format:str='pdf'): """Function that plots log fold change per gene over noise using a boxplot. - + Parameters: - data_quantified: DataFrame containing the extracellular transcript data, including feature names and codeword categories. - scores_by_genes: DataFrame containing gene scores with feature names and log fold ratios. 
@@ -650,36 +647,36 @@ def paired_nmf_factors( ): """ Plots the spatial distribution of NMF factors for extracellular transcripts and cells. - + Parameters: ---------- sdata : spatial data object The spatial data object containing both extracellular and cell data. - + layer : str, optional Layer in sdata to extract the NMF data from (default: 'nmf_data'). - + n_factors : int, optional Number of NMF factors to plot (default: 5). - + figsize : tuple, optional Size of the figure for each subplot (default: (12, 6)). - + spot_size_exrna : float, optional Size of the spots for extracellular transcript scatter plot (default: 5). - + spot_size_cells : float, optional Size of the spots for cell scatter plot (default: 10). - + cmap_exrna : str, optional Colormap for the extracellular transcript NMF factors (default: 'YlGnBu'). - + cmap_cells : str, optional Colormap for the cell NMF factors (default: 'Reds'). - + vmax_exrna : str or float, optional Maximum value for extracellular transcript color scale (default: 'p99'). - + vmax_cells : str or float, optional Maximum value for cell color scale (default: None). """ @@ -797,61 +794,61 @@ def spatial_interactions( ---------- sdata : AnnData An AnnData object containing the spatial omics data, including transcript expression and cell positions. - + layer : str, optional, default: 'extracellular_transcripts_enriched' The layer in the AnnData object that contains the extracellular RNA transcript data. - + gene : str, optional, default: 'Arc' The gene of interest to be visualized in terms of its spatial interaction with source and target cells. - + gene_key : str, optional, default: 'feature_name' The column name in the AnnData object used to identify the gene. - + cell_id_key : str, optional, default: 'cell_id' The column name in the AnnData object used to identify individual cells. - + color_target : str, optional, default: 'blue' The color to be used for target cells in the plot. 
- + color_source : str, optional, default: 'red' The color to be used for source cells in the plot. - + color_transcript : str, optional, default: 'green' The color to be used for the RNA transcripts in the plot. - + spatial_key : str, optional, default: 'spatial' The key in the AnnData object that stores the spatial coordinates of the cells. - + img : Optional[Union[bool, Sequence]], optional, default: None A background image to overlay on the plot, such as a tissue section. Can be set to `None` to omit. - + img_alpha : Optional[float], optional, default: None The transparency level of the background image. Ignored if `img` is `None`. - + image_cmap : Optional[Colormap], optional, default: None The colormap to be used for the background image, if applicable. - + size : Optional[Union[float, Sequence[float]]], optional, default: 8 The size of the scatter plot points for the cells and transcripts. - + alpha : float, optional, default: 0.6 The transparency level for the scatter plot points. - + title : Optional[Union[str, Sequence[str]]], optional, default: None The title of the plot. If `None`, the gene name is used. - + legend_loc : Optional[str], optional, default: 'best' The location of the legend in the plot. - + figsize : Tuple[float, float], optional, default: (10, 10) The dimensions of the plot in inches. - + dpi : Optional[int], optional, default: 100 The resolution (dots per inch) for the plot. - + save : Optional[Union[str, Path]], optional, default: None The path to save the plot image. If `None`, the plot is displayed but not saved. - + **kwargs : Additional keyword arguments Any additional arguments passed to the `scatter` or `imshow` functions for customizing plot appearance. 
@@ -895,7 +892,7 @@ def interactions_with_arrows( cell_id_key: str = 'cell_id', color_target: str = 'blue', color_source: str = 'red', - color_transcript:str='green', + color_transcript: str = 'green', spatial_key: str = 'spatial', img: Optional[Union[bool, Sequence]] = None, img_alpha: Optional[float] = None, @@ -909,6 +906,40 @@ def interactions_with_arrows( save: Optional[Union[str, Path]] = None, **kwargs ): + """ + Visualizes interactions between source and target cells using arrows, along with transcript locations. + + The function plots arrows from source to target cells based on transcript proximity, color-coding source and target cells, and transcript locations. An optional image layer can be overlaid behind the plot. + + Parameters: + sdata (AnnData): The AnnData object containing the spatial omics data. + layer (str, optional): The key in `sdata` for the extracellular transcript layer to analyze. Default is 'extracellular_transcripts_enriched'. + gene (str, optional): The gene of interest. Default is 'Arc'. + gene_key (str, optional): The key for gene names in the data. Default is 'feature_name'. + cell_id_key (str, optional): The key for cell IDs. Default is 'cell_id'. + color_target (str, optional): Color for the target cells. Default is 'blue'. + color_source (str, optional): Color for the source cells. Default is 'red'. + color_transcript (str, optional): Color for the transcript locations. Default is 'green'. + spatial_key (str, optional): The key for spatial coordinates in `sdata`. Default is 'spatial'. + img (Optional[Union[bool, Sequence]], optional): Optional background image (e.g., tissue section) to display behind the plot. + img_alpha (Optional[float], optional): Transparency level for the background image. Default is None (no image). + image_cmap (Optional[Colormap], optional): Colormap for the image. Default is None. + size (Optional[Union[float, Sequence[float]]], optional): Size of the plotted points (cells and transcripts). Default is 8. 
+ alpha (float, optional): Transparency level for plotted points. Default is 0.6. + title (Optional[Union[str, Sequence[str]]], optional): Title of the plot. Default is the gene name. + legend_loc (Optional[str], optional): Location of the legend on the plot. Default is 'best'. + figsize (Tuple[float, float], optional): Size of the plot. Default is (10, 10). + dpi (Optional[int], optional): Resolution of the plot. Default is 100. + save (Optional[Union[str, Path]], optional): If provided, the path where the plot will be saved. + **kwargs: Additional arguments passed to the `scatter` and `imshow` functions for customization. + + Returns: + None: The function displays or saves a plot of interactions between cells and transcripts. + + Notes: + The plot will show arrows from source to target cells, with different colors for source, target, and transcript points. + """ + # Extract relevant data transcripts = sdata.points[layer] trans_filt = transcripts[transcripts[gene_key] == gene] @@ -924,7 +955,7 @@ def interactions_with_arrows( # Plot arrows between each paired source and target cell for source, target in zip(source_cells, target_cells): if source in cell_positions.index and target in cell_positions.index: - if source!=target: + if source != target: x_start, y_start = cell_positions.loc[source, 'x'], cell_positions.loc[source, 'y'] x_end, y_end = cell_positions.loc[target, 'x'], cell_positions.loc[target, 'y'] plt.arrow(x_start, y_start, x_end - x_start, y_end - y_start, color='black', alpha=0.8, head_width=8, head_length=8) @@ -933,7 +964,7 @@ def interactions_with_arrows( plt.scatter(cell_positions['x'], cell_positions['y'], c='grey', s=0.6, alpha=alpha, **kwargs) plt.scatter(cell_positions.loc[target_cells, 'x'], cell_positions.loc[target_cells, 'y'], c=color_target, s=size, label='Target Cells', **kwargs) plt.scatter(cell_positions.loc[source_cells, 'x'], cell_positions.loc[source_cells, 'y'], c=color_source, s=size, label='Source Cells', **kwargs) - 
plt.scatter(trans_filt['x'], trans_filt['y'], c=color_transcript, s=size*0.4, label='Transcripts', **kwargs) + plt.scatter(trans_filt['x'], trans_filt['y'], c=color_transcript, s=size * 0.4, label='Transcripts', **kwargs) # Titles and Legends plt.title(title or gene) @@ -944,4 +975,4 @@ def interactions_with_arrows( # Save the plot if path provided if save: plt.savefig(save) - plt.show() + plt.show() \ No newline at end of file diff --git a/src/troutpy/tl/NMF.py b/src/troutpy/tl/NMF.py index ec921e1..5f9312d 100644 --- a/src/troutpy/tl/NMF.py +++ b/src/troutpy/tl/NMF.py @@ -54,25 +54,25 @@ def nmf( ---------- sdata : spatial data object Input spatial data containing transcript and bin data. - + layer : str, optional Layer name of the data that contains extracellular transcripts (default: 'extracellular_transcripts_enriched'). - + feature_key : str, optional Column name for the transcript feature (default: 'feature_name'). - + bin_key : str, optional Column name for bin IDs (default: 'bin_id'). - + density_table_key : str, optional Key to retrieve the density table from sdata (default: 'segmentation_free_table'). - + n_components : int, optional Number of components for NMF (default: 20). - + subsample_percentage : float, optional Percentage of data to use for NMF (default: 0.1). - + random_state : int, optional Random state for NMF initialization for reproducibility (default: None). 
@@ -93,10 +93,6 @@ def nmf( # Retrieve the segmentation-free density table else: adata_density = sdata[density_table_key] - - - - # Apply NMF to filtered data adata_nmf = apply_nmf_to_adata( adata_density, @@ -110,31 +106,52 @@ def nmf( return sdata -def apply_exrna_factors_to_cells(sdata,layer_factors='nmf_data'): - adata_extracellular_with_nmf=sdata[layer_factors] - adata_annotated_cellular=sdata['table'] - ### - H = adata_extracellular_with_nmf.uns['H_nmf'] +def apply_exrna_factors_to_cells(sdata, layer_factors='nmf_data'): + """Applies extracellular RNA (exRNA) factor loadings to cellular annotation data based on NMF factors. - # Check the number of genes in adata_annotated and spots2region_output to match gene loadings (H) + This function extracts extracellular RNA data and associated NMF factor loadings, intersects the gene annotations between the extracellular data and the cellular data, and applies the NMF factors to annotate the cellular data with exRNA-related factors. + + Parameters: + sdata (AnnData): The AnnData object containing both extracellular and cellular data. + layer_factors (str, optional): The key in `sdata` that contains the extracellular RNA data with NMF factors. Default is 'nmf_data'. + + Returns: + AnnData: The updated `sdata` object with annotated cellular data that includes the applied exRNA factors as new columns. + + Notes: + The function assumes that the extracellular RNA data is stored in `sdata[layer_factors]` and that the NMF factor loadings are stored in the `uns` attribute of the extracellular dataset as 'H_nmf'. The factor scores are added to the `obs` attribute of the cellular data. 
+ """ + + # Extract extracellular data and cellular annotations + adata_extracellular_with_nmf = sdata[layer_factors] + adata_annotated_cellular = sdata['table'] + + # Retrieve NMF factor loadings (H matrix) from extracellular data + H = adata_extracellular_with_nmf.uns['H_nmf'] + + # Get gene names from both datasets genes_spots2region = adata_extracellular_with_nmf.var_names genes_annotated = adata_annotated_cellular.var_names - - # Get intersection of genes between the two datasets + + # Get the intersection of genes between the extracellular and cellular datasets common_genes = genes_annotated.intersection(genes_spots2region) - - # Filter both datasets to keep only common genes + + # Filter both datasets to retain only the common genes adata_annotated_cellular = adata_annotated_cellular[:, common_genes] - H_filtered = H[:, np.isin(genes_spots2region, common_genes)] # Filtered NMF gene loadings for common genes - - # Apply the NMF factors to the annotated dataset - # Calculate the new W matrix by multiplying the annotated data with the filtered H + H_filtered = H[:, np.isin(genes_spots2region, common_genes)] # Filtered NMF factor loadings for common genes + + # Apply NMF factors to the annotated cellular dataset + # Calculate the W matrix by multiplying the cellular data (X) with the filtered NMF loadings (H) W_annotated = adata_annotated_cellular.X @ H_filtered.T - - adata_annotated_cellular.obsm['factors']=pd.DataFrame(W_annotated,index=adata_annotated_cellular.obs.index) - #print(W_annotated[:, 0].shape) - # Add the factors as new columns in adata_annotated.obs + + # Store the factors in the 'obsm' attribute of the AnnData object + adata_annotated_cellular.obsm['factors'] = pd.DataFrame(W_annotated, index=adata_annotated_cellular.obs.index) + + # Add each factor as a new column in the 'obs' attribute of the cellular dataset for factor in range(W_annotated.shape[1]): adata_annotated_cellular.obs[f'NMF_factor_{factor + 1}'] = W_annotated[:, factor] - 
sdata['table']=adata_annotated_cellular + + # Update the 'table' in the sdata object with the annotated cellular data + sdata['table'] = adata_annotated_cellular + return sdata diff --git a/src/troutpy/tl/estimate_density.py b/src/troutpy/tl/estimate_density.py index d9a3d13..ba27419 100644 --- a/src/troutpy/tl/estimate_density.py +++ b/src/troutpy/tl/estimate_density.py @@ -10,13 +10,13 @@ def colocalization_proportion( ): """ Calculate the proportion of colocalized transcripts for each gene in the provided AnnData object. - + Parameters: - sdata: AnnData object with `.X` matrix containing the density of transcripts per gene. - outpath: The directory path where the output file should be saved. - threshold_colocalized: The threshold for considering a transcript colocalized (default is 1). - filename: The name of the output file (default is 'proportion_of_grouped_exRNA.parquet'). - + Returns: - coloc: DataFrame containing the proportion of colocalized transcripts for each gene. """ diff --git a/src/troutpy/tl/interactions.py b/src/troutpy/tl/interactions.py index 7f6907b..3c4cfb6 100644 --- a/src/troutpy/tl/interactions.py +++ b/src/troutpy/tl/interactions.py @@ -15,7 +15,7 @@ def get_number_of_communication_genes( target_proportion_threshold: float = 0.2 ) -> pd.DataFrame: """Compute the number of exchanged genes between any two cell types - + Args: source_proportions (pd.DataFrame): A data frame (Gene name x Cell Type) with proportion of cells per cell type expressing corresponding gene diff --git a/src/troutpy/tl/quantify_xrna.py b/src/troutpy/tl/quantify_xrna.py index 099d0ec..39bc1b2 100644 --- a/src/troutpy/tl/quantify_xrna.py +++ b/src/troutpy/tl/quantify_xrna.py @@ -188,9 +188,7 @@ def quantify_overexpression( percentile_threshold: float = 100, copy=False ) -> Tuple[pd.DataFrame, pd.DataFrame, float]: - """Compare counts per gene with counts per non-gene feature. We define a threshold as the 'percentile_threshold' - counts of non-gene counts (e.g. 
'percentile_threshold = 100' corresponds to the maximum number of counts observed - in any non-gene feature). Any gene whose counts are above the threshold are considered overexpressed. + """Compare counts per gene with counts per non-gene feature. We define a threshold as the 'percentile_threshold' counts of non-gene counts (e.g. 'percentile_threshold = 100' corresponds to the maximum number of counts observed in any non-gene feature). Any gene whose counts are above the threshold are considered overexpressed. Args: sdata (pd.DataFrame): The spatial data object holding points and transcript data. @@ -244,9 +242,7 @@ def extracellular_enrichment(sdata, gene_id_column: str = 'feature_name', copy: """ Calculate the proportion of extracellular and intracellular transcripts for each gene and integrate results into the AnnData object. - This function computes the proportion of transcripts classified as extracellular or intracellular for each gene - and calculates additional metrics, including log fold change of extracellular to intracellular proportions. - The results are integrated into the `sdata` object under the 'xrna_metadata' layer. + This function computes the proportion of transcripts classified as extracellular or intracellular for each gene and calculates additional metrics, including log fold change of extracellular to intracellular proportions. The results are integrated into the `sdata` object under the 'xrna_metadata' layer. Parameters: ----------- diff --git a/src/troutpy/tl/segmentation_free.py b/src/troutpy/tl/segmentation_free.py index 71e7de2..bdd792f 100644 --- a/src/troutpy/tl/segmentation_free.py +++ b/src/troutpy/tl/segmentation_free.py @@ -14,11 +14,9 @@ def segmentation_free_clustering( transcript_id: str = 'transcript_id', copy: bool = False ): - """ - Perform segmentation-free clustering on transcriptomic spatial data. + """Perform segmentation-free clustering on transcriptomic spatial data. 
- This function clusters transcriptomic data without relying on pre-defined cell or tissue segmentations. - It supports multiple clustering methods, with Points2Regions being the default. + This function clusters transcriptomic data without relying on pre-defined cell or tissue segmentations. It supports multiple clustering methods, with Points2Regions being the default. Parameters: sdata : SpatialData diff --git a/src/troutpy/tl/source_cell.py b/src/troutpy/tl/source_cell.py index 7fc036b..7cc8045 100644 --- a/src/troutpy/tl/source_cell.py +++ b/src/troutpy/tl/source_cell.py @@ -95,8 +95,7 @@ def compute_source_cells( copy=False ): """ - Compute the source of extracellular RNA by linking detected extracellular transcripts - to specific cell types in the spatial data. + Compute the source of extracellular RNA by linking detected extracellular transcripts to specific cell types in the spatial data. Parameters: ---------- @@ -150,27 +149,55 @@ def distance_to_source_cell( ycoord='y', xcellcoord='x_centroid', ycellcoord='y_centroid', - gene_id_column='feature_name',copy=False): + gene_id_column='feature_name', + copy=False +): + """Calculates the distance between extracellular RNA transcripts and their closest source cells. + + This function computes the distance from each extracellular RNA transcript to the nearest source cell based on their spatial coordinates. The function uses a KDTree to efficiently find the closest cell to each transcript, storing the results in the `sdata` object. + + Parameters: + sdata (AnnData): The AnnData object containing both transcript and cellular data. + layer (str, optional): The layer in `sdata` containing the transcript data. Default is 'transcripts'. + xcoord (str, optional): The column name in the transcript data for the x-coordinate. Default is 'x'. + ycoord (str, optional): The column name in the transcript data for the y-coordinate. Default is 'y'. 
+ xcellcoord (str, optional): The column name in the cellular data for the x-coordinate of cell centroids. Default is 'x_centroid'. + ycellcoord (str, optional): The column name in the cellular data for the y-coordinate of cell centroids. Default is 'y_centroid'. + gene_id_column (str, optional): The column name for the gene identifier. Default is 'feature_name'. + copy (bool, optional): Whether to return a copy of the `sdata` object with updated distances, or modify in place. Default is False. + + Returns: + AnnData or None: If `copy` is True, returns the updated `sdata` object. Otherwise, modifies `sdata` in place and returns None. + + Notes: + The function assumes that the transcript data contains a column `transcript_id` and that the cellular data contains + cell centroids for spatial coordinates. The KDTree algorithm is used to compute the closest cell for each transcript. + The resulting distances are stored in the `distance_to_source_cell` column of the `sdata` object's transcript layer, + and the closest source cell is stored in the `closest_source_cell` column. + The median distance for each gene is also added to the `xrna_metadata` in the `var` attribute of `sdata`. 
+ """ - # transcripts + # Extract transcript and cellular data adata_bin = sdata['table'].copy() adata_bin.X = sdata['table'].layers['raw'] adata_bin.obs['x_centroid'] = [sp[0] for sp in adata_bin.obsm['spatial']] adata_bin.obs['y_centroid'] = [sp[1] for sp in adata_bin.obsm['spatial']] transcripts = sdata.points[layer].compute() extracellular_transcripts = transcripts[transcripts['extracellular']] - # Filter extracellular transcripts to those in adata_bin - #extracellular_transcripts = extracellular_transcripts[extracellular_transcripts["feature_name"].isin(adata_bin.var_names)] + + # Initialize lists to store results tranid = [] dist = [] - cellids=[] + cellids = [] + # Loop through each gene in the cellular data for gene_of_interest in tqdm(adata_bin.var_names): gene_idx = np.where(adata_bin.var_names == gene_of_interest)[0][0] - adata_filtered = adata_bin[adata_bin.X[:, gene_idx] > 0]#.copy() + adata_filtered = adata_bin[adata_bin.X[:, gene_idx] > 0] extracellular_transcripts_filtered = extracellular_transcripts[extracellular_transcripts[gene_id_column] == gene_of_interest].copy() - # Only proceed if there are positive cells for the gene of interests - if (adata_filtered.n_obs > 0) & (extracellular_transcripts_filtered.shape[0]>0) : + + # Only proceed if there are positive cells for the gene of interest + if (adata_filtered.n_obs > 0) & (extracellular_transcripts_filtered.shape[0] > 0): # Extract coordinates of cells and transcripts cell_coords = np.array([adata_filtered.obs[xcellcoord], adata_filtered.obs[ycellcoord]]).T transcript_coords = np.array([extracellular_transcripts_filtered[xcoord], extracellular_transcripts_filtered[ycoord]]).T @@ -178,30 +205,32 @@ def distance_to_source_cell( # Compute KDTree for nearest cell tree = KDTree(cell_coords) distances, closest_cells_indices = tree.query(transcript_coords, k=1) + # Append results to lists tranid.extend(extracellular_transcripts_filtered['transcript_id']) - dist.extend([d[0]for d in distances]) + 
dist.extend([d[0] for d in distances]) cell_ids = adata_filtered.obs['cell_id'].values[closest_cells_indices.flatten()] cellids.extend(c[0] for c in cell_ids.reshape(closest_cells_indices.shape)) - # Create a dictionary to map transcript IDs to distances + + # Create a dictionary to map transcript IDs to distances and cell IDs id2dist = dict(zip(tranid, dist)) - id2closeid = dict(zip(tranid,cellids)) + id2closeid = dict(zip(tranid, cellids)) + # Store the results in the DataFrame transcripts['distance_to_source_cell'] = transcripts['transcript_id'].map(id2dist) transcripts['closest_source_cell'] = transcripts['transcript_id'].map(id2closeid) sdata.points[layer] = sd.models.PointsModel.parse(transcripts) - # add median distance_to_source_cell - dist_to_source=transcripts.loc[:,[gene_id_column,'distance_to_source_cell']].groupby(gene_id_column).median() - dist_to_source.columns=['median_distance_to_source_cell'] - sdata['xrna_metadata'].var=sdata['xrna_metadata'].var.join(dist_to_source) + # Add median distance_to_source_cell + dist_to_source = transcripts.loc[:, [gene_id_column, 'distance_to_source_cell']].groupby(gene_id_column).median() + dist_to_source.columns = ['median_distance_to_source_cell'] + sdata['xrna_metadata'].var = sdata['xrna_metadata'].var.join(dist_to_source) return sdata.copy() if copy else None def compute_distant_cells_prop(sdata, layer='transcripts', gene_id_column='feature_name', threshold=30,copy=False): """ - Compute the proportion of transcripts for each gene that are located beyond a specified distance - from their closest source cell, and add the result to the metadata of the SpatialData object. + Compute the proportion of transcripts for each gene that are located beyond a specified distance from their closest source cell, and add the result to the metadata of the SpatialData object. 
Parameters ---------- @@ -212,19 +241,16 @@ def compute_distant_cells_prop(sdata, layer='transcripts', gene_id_column='featu gene_id_column : str, optional Column name in the transcript data representing gene identifiers. Default is 'feature_name'. threshold : float, optional - The distance threshold (in micrometers) to calculate the proportion of transcripts farther away - from their closest source cell. Default is 30. + The distance threshold (in micrometers) to calculate the proportion of transcripts farther away from their closest source cell. Default is 30. Returns ------- None - The function modifies the `sdata` object in place, adding the computed proportions as a new - column in `sdata['xrna_metadata'].var`. + The function modifies the `sdata` object in place, adding the computed proportions as a new column in `sdata['xrna_metadata'].var`. Notes ----- - - This function assumes that `sdata.points[layer]` contains a column `distance_to_source_cell` - with distances between transcripts and their closest source cells. + - This function assumes that `sdata.points[layer]` contains a column `distance_to_source_cell` with distances between transcripts and their closest source cells. - The resulting column is named `frac_beyond__from_source`. Example