Skip to content

Commit

Permalink
Merge pull request #751 from theislab/fix/celltype-mapping
Browse files Browse the repository at this point in the history
Fix `cell_transition` bug
  • Loading branch information
selmanozleyen authored Oct 10, 2024
2 parents c981a05 + 92acfbe commit ba8d30f
Showing 1 changed file with 117 additions and 127 deletions.
244 changes: 117 additions & 127 deletions src/moscot/base/problems/_mixins.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from __future__ import annotations

import types
from functools import partial
from typing import (
TYPE_CHECKING,
Any,
Callable,
Generic,
Iterable,
Literal,
Expand Down Expand Up @@ -175,6 +177,38 @@ def _cell_transition(
)
return tm

def _annotation_aggregation_transition(
self: AnalysisMixinProtocol[K, B],
annotations_1: list[Any],
annotations_2: list[Any],
df: pd.DataFrame,
func: Callable[..., ArrayLike],
) -> pd.DataFrame:
n1 = len(annotations_1)
n2 = len(annotations_2)
tm_arr = np.zeros((n1, n2))

# Factorize annotations in df_res_annotation
codes, uniques = pd.factorize(df.values)
# Map annotations in 'annotations_2' to indices in 'uniques'
annotations_in_df_to_idx = {annotation: idx for idx, annotation in enumerate(uniques)}
annotations_2_codes = [annotations_in_df_to_idx.get(annotation, -1) for annotation in annotations_2]

for i, subset in enumerate(annotations_1):
result = func(
subset=subset,
)
# Compute sums over 'codes' weighted by 'result'
sums = np.bincount(codes, weights=result.squeeze(), minlength=len(uniques))
dist = [sums[code] if code != -1 else 0 for code in annotations_2_codes]
tm_arr[i, :] = dist

return pd.DataFrame(
tm_arr,
index=annotations_1,
columns=annotations_2,
)

def _cell_transition_online(
self: AnalysisMixinProtocol[K, B],
key: Optional[str],
Expand All @@ -196,88 +230,70 @@ def _cell_transition_online(
target_annotation_key, target_annotations, target_annotations_ordered = _validate_args_cell_transition(
self.adata if other_adata is None else other_adata, target_groups
)
new_annotation_key = "new_annotation"
df_source = _get_df_cell_transition(
self.adata,
[source_annotation_key] if aggregation_mode == "cell" else [source_annotation_key, target_annotation_key],
[source_annotation_key],
key,
source,
)
).rename(columns={source_annotation_key: new_annotation_key})
df_target = _get_df_cell_transition(
self.adata if other_adata is None else other_adata,
[target_annotation_key] if aggregation_mode == "cell" else [source_annotation_key, target_annotation_key],
[target_annotation_key],
key if other_adata is None else other_key,
target,
)

).rename(columns={target_annotation_key: new_annotation_key})
source_annotations_verified, target_annotations_verified = _validate_annotations(
df_source=df_source,
df_target=df_target,
source_annotation_key=source_annotation_key,
target_annotation_key=target_annotation_key,
source_annotation_key=new_annotation_key,
target_annotation_key=new_annotation_key,
source_annotations=source_annotations,
target_annotations=target_annotations,
aggregation_mode=aggregation_mode,
forward=forward,
)
df_to, df_from = (df_target, df_source) if forward else (df_source, df_target)
df_to = df_to[new_annotation_key]
move_op = self.push if forward else self.pull
move_op_const_kwargs = {
"source": source,
"target": target,
"normalize": True,
"return_all": False,
"scale_by_marginals": False,
"key_added": None,
}

if aggregation_mode == "annotation":
df_target["distribution"] = 0
df_source["distribution"] = 0
tm = pd.DataFrame(
np.zeros((len(source_annotations_verified), len(target_annotations_verified))),
index=source_annotations_verified,
columns=target_annotations_verified,
func = partial(
move_op,
data=source_annotation_key if forward else target_annotation_key,
split_mass=False,
**move_op_const_kwargs,
)
if forward:
tm = self._annotation_aggregation_transition( # type: ignore[attr-defined]
source=source,
target=target,
annotation_key=source_annotation_key,
annotations_1=source_annotations_verified,
annotations_2=target_annotations_verified,
df=df_target,
tm=tm,
forward=True,
)
else:
tm = self._annotation_aggregation_transition( # type: ignore[attr-defined]
source=source,
target=target,
annotation_key=target_annotation_key,
annotations_1=target_annotations_verified,
annotations_2=source_annotations_verified,
df=df_source,
tm=tm,
forward=False,
)
tm = self._annotation_aggregation_transition( # type: ignore[attr-defined]
annotations_1=source_annotations_verified if forward else target_annotations_verified,
annotations_2=target_annotations_verified if forward else source_annotations_verified,
df=df_to,
func=func,
)

elif aggregation_mode == "cell":
tm = pd.DataFrame(columns=target_annotations_verified if forward else source_annotations_verified)
if forward:
tm = self._cell_aggregation_transition( # type: ignore[attr-defined]
source=source,
target=target,
annotation_key=target_annotation_key,
annotations_1=source_annotations_verified,
annotations_2=target_annotations_verified,
df_1=df_target,
df_2=df_source,
tm=tm,
batch_size=batch_size,
forward=True,
)
else:
tm = self._cell_aggregation_transition( # type: ignore[attr-defined]
source=source,
target=target,
annotation_key=source_annotation_key,
annotations_1=target_annotations_verified,
annotations_2=source_annotations_verified,
df_1=df_source,
df_2=df_target,
tm=tm,
batch_size=batch_size,
forward=False,
)
func = partial(
move_op,
data=None,
split_mass=True,
**move_op_const_kwargs,
)
tm = self._cell_aggregation_transition( # type: ignore[attr-defined]
df_from=df_from,
df_to=df_to,
annotations=target_annotations_verified if forward else source_annotations_verified,
batch_size=batch_size,
func=func,
)

else:
raise NotImplementedError(f"Aggregation mode `{aggregation_mode!r}` is not yet implemented.")

Expand Down Expand Up @@ -476,76 +492,50 @@ def _flatten(self: AnalysisMixinProtocol[K, B], data: dict[K, ArrayLike], *, key
tmp[mask] = np.squeeze(v)
return tmp

def _annotation_aggregation_transition(
self: AnalysisMixinProtocol[K, B],
source: K,
target: K,
annotation_key: str,
annotations_1: list[Any],
annotations_2: list[Any],
df: pd.DataFrame,
tm: pd.DataFrame,
forward: bool,
) -> pd.DataFrame:
if not forward:
tm = tm.T
func = self.push if forward else self.pull
for subset in annotations_1:
result = func( # TODO(@MUCDK) check how to make compatible with all policies
source=source,
target=target,
data=annotation_key,
subset=subset,
normalize=True,
return_all=False,
scale_by_marginals=False,
split_mass=False,
key_added=None,
)
df["distribution"] = result
cell_dist = df[df[annotation_key].isin(annotations_2)].groupby(annotation_key).sum(numeric_only=True)
cell_dist /= cell_dist.sum()
tm.loc[subset, :] = [
cell_dist.loc[annotation, "distribution"] if annotation in cell_dist.distribution.index else 0
for annotation in annotations_2
]
return tm

def _cell_aggregation_transition(
self: AnalysisMixinProtocol[K, B],
source: str,
target: str,
annotation_key: str,
# TODO(MUCDK): unused variables, del below
annotations_1: list[Any],
annotations_2: list[Any],
df_1: pd.DataFrame,
df_2: pd.DataFrame,
tm: pd.DataFrame,
df_from: pd.DataFrame,
df_to: pd.DataFrame,
annotations: list[Any],
batch_size: Optional[int],
forward: bool,
func: Callable[..., ArrayLike],
) -> pd.DataFrame:
func = self.push if forward else self.pull

# Factorize annotations in df_to
annotations_in_df_to = df_to.values
codes_to, uniques_to = pd.factorize(annotations_in_df_to)
# Map annotations in 'annotations' to codes
annotations_to_code = {annotation: idx for idx, annotation in enumerate(uniques_to)}
annotations_codes = [annotations_to_code.get(annotation, -1) for annotation in annotations]
n_annotations = len(annotations)
n_from_cells = len(df_from)

if batch_size is None:
batch_size = len(df_2)
for batch in range(0, len(df_2), batch_size):
result = func( # TODO(@MUCDK) check how to make compatible with all policies
source=source,
target=target,
data=None,
subset=(batch, batch_size),
normalize=True,
return_all=False,
scale_by_marginals=False,
split_mass=True,
key_added=None,
)
current_cells = df_2.iloc[range(batch, min(batch + batch_size, len(df_2)))].index.tolist()
df_1.loc[:, current_cells] = result
to_app = df_1[df_1[annotation_key].isin(annotations_2)].groupby(annotation_key).sum().transpose()
tm = pd.concat([tm, to_app], verify_integrity=True, axis=0)
df_1 = df_1.drop(current_cells, axis=1)
return tm
batch_size = n_from_cells

tm_arr = np.zeros((n_from_cells, n_annotations))
index = df_from.index

# Process in batches
for batch_start in range(0, n_from_cells, batch_size):
batch_end = min(batch_start + batch_size, n_from_cells)
subset = (batch_start, batch_end - batch_start)
result = func(subset=subset)
# Result shape: (n_to_cells, batch_size)
# For each cell in the batch, we compute the sum over annotations
for i in range(batch_end - batch_start):
cell_distribution = result[:, i]
# Aggregate over annotations using bincount
sums = np.bincount(
codes_to,
weights=cell_distribution,
minlength=len(uniques_to),
)
# Map sums to annotations_verified_codes
dist = [sums[code] if code != -1 else 0 for code in annotations_codes]
tm_arr[batch_start + i, :] = dist

return pd.DataFrame(tm_arr, index=index, columns=annotations)

# adapted from:
# https://github.com/theislab/cellrank/blob/master/cellrank/_utils/_utils.py#L392
Expand Down

0 comments on commit ba8d30f

Please sign in to comment.