Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(fix): obs filtering #148

Merged
merged 17 commits into from
Oct 16, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Deduplicate filter_obs/filter_var code
gtca committed Oct 16, 2024

Verified

This commit was signed with the committer’s verified signature.
gtca Danila
commit 1909f5190e32741799b684474563233198e7b06f
258 changes: 90 additions & 168 deletions muon/_core/preproc.py
Original file line number Diff line number Diff line change
@@ -657,23 +657,24 @@ def intersect_obs(mdata: MuData):
return


# Utility functions: filtering observations
# Utility functions: filtering observations or variables


def filter_obs(
data: Union[AnnData, MuData], var: Union[str, Sequence[str]], func: Optional[Callable] = None
def _filter_attr(
data: Union[AnnData, MuData],
attr: Literal["obs", "var"],
key: Union[str, Sequence[str]],
func: Optional[Callable] = None,
) -> None:
"""
Filter observations (samples or cells) in-place
using any column in .obs or in .X.
Filter observations or variables in-place.

Parameters
----------
data: AnnData or MuData
AnnData or MuData object
var: str or Sequence[str]
Column name in .obs or in .X to be used for filtering.
Alternatively, obs_names can be provided directly.
key: str or Sequence[str]
Names or key to filter
func
Function to apply to the variable used for filtering.
If the variable is of type boolean and func is an identity function,
@@ -694,59 +695,76 @@ def filter_obs(
"MuData object is backed. The requested subset of the .X matrices of its modalities will be read into memory, and the object will not be backed anymore."
)

if isinstance(var, str):
if var in data.obs.columns:
assert attr in ("obs", "var"), "Attribute has to be either 'obs' or 'var'."

df = getattr(data, attr)
names = getattr(data, f"{attr}_names")
other = "obs" if attr == "var" else "var"
other_names = getattr(data, f"{other}_names")
attrm = getattr(data, f"{attr}m")
attrp = getattr(data, f"{attr}p")

if isinstance(key, str):
if key in df.columns:
if func is None:
if data.obs[var].dtypes.name == "bool":
if df[key].dtypes.name == "bool":

def func(x):
return x

else:
raise ValueError(f"Function has to be provided since {var} is not boolean")
obs_subset = func(data.obs[var].values)
elif var in data.var_names:
obs_subset = func(data.X[:, np.where(data.var_names == var)[0]].reshape(-1))
raise ValueError(f"Function has to be provided since {key} is not boolean")
subset = func(df[key].values)
elif key in other_names:
if attr == "obs":
subset = func(data.X[:, np.where(other_names == key)[0]].reshape(-1))
else:
subset = func(data.X[np.where(other_names == key)[0], :].reshape(-1))
else:
raise ValueError(
f"Column name from .obs or one of the var_names was expected but got {var}."
f"Column name from .{attr} or one of the {other}_names was expected but got {key}."
)
else:
if func is None:
if np.array(var).dtype == bool:
obs_subset = np.array(var)
if np.array(key).dtype == bool:
subset = np.array(key)
else:
obs_subset = data.obs_names.isin(var)
subset = names.isin(key)
else:
raise ValueError("When providing obs_names directly, func has to be None.")
raise ValueError(f"When providing {attr}_names directly, func has to be None.")

if isinstance(data, AnnData):
# Collect elements to subset
# NOTE: accessing them after subsetting .obs
# NOTE: accessing them after subsetting .obs/.var
# will fail due to _validate_value()
obsm = dict(data.obsm)
obsp = dict(data.obsp)
attrm = dict(attrm)
attrp = dict(attrp)

# Subset .obs
data._obs = data.obs[obs_subset]
data._n_obs = data.obs.shape[0]
# Subset .obs/.var
setattr(data, f"_{attr}", df[subset])

# Subset .obsm
for k, v in obsm.items():
obsm[k] = v[obs_subset]
data.obsm = obsm
# Subset .obsm/.varm
for k, v in attrm.items():
attrm[k] = v[subset]
setattr(data, f"{attr}m", attrm)

# Subset .obsp
for k, v in obsp.items():
obsp[k] = v[obs_subset][:, obs_subset]
data.obsp = obsp
# Subset .obsp/.obsp
for k, v in attrp.items():
attrp[k] = v[subset][:, subset]
setattr(data, f"{attr}p", attrp)

# Subset .X
if data._X is not None:
try:
data._X = data.X[obs_subset, :]
if attr == "obs":
data._X = data.X[subset, :]
else:
data._X = data.X[:, subset]
except TypeError:
data._X = data.X[np.where(obs_subset)[0], :]
if attr == "obs":
data._X = data.X[np.where(subset)[0], :]
else:
data._X = data.X[:, np.where(subset)[0]]
# For some h5py versions, indexing arrays must have integer dtypes
# https://github.com/h5py/h5py/issues/1847

@@ -756,165 +774,69 @@ def func(x):

# Subset layers
for layer in data.layers:
data.layers[layer] = data.layers[layer][obs_subset, :]
if attr == "obs":
data.layers[layer] = data.layers[layer][subset, :]
else:
data.layers[layer] = data.layers[layer][:, subset]

# Subset raw
if data.raw is not None:
data.raw._X = data.raw.X[obs_subset, :]
data.raw._n_obs = data.raw.X.shape[0]
# Subset raw - only when subsetting obs
if attr == "obs" and data.raw is not None:
data.raw._X = data.raw.X[subset, :]

else:
# Subset .obs
data._obs = data.obs[obs_subset]
data._n_obs = data.obs.shape[0]
attrmap = getattr(data, f"{attr}map")

# Subset .obs/.var
setattr(data, f"_{attr}", df[subset])

# Subset .obsm
for k, v in data.obsm.items():
data.obsm[k] = v[obs_subset]
# Subset .obsm/.varm
for k, v in attrm.items():
attrm[k] = v[subset]
setattr(data, f"{attr}m", attrm)

# Subset .obsp
for k, v in data.obsp.items():
data.obsp[k] = v[obs_subset][:, obs_subset]
# Subset .obsp/.varp
for k, v in attrp.items():
attrp[k] = v[subset][:, subset]
setattr(data, f"{attr}p", attrp)

# filter_obs() for each modality
# _filter_attr() for each modality
for m, mod in data.mod.items():
obsmap = data.obsmap[m][obs_subset]
obsidx = obsmap > 0
orig_obs = mod.obs.copy()
filter_obs(mod, mod.obs_names[obsmap[obsidx] - 1])
data.mod[m]._remove_unused_categories(orig_obs, mod.obs, mod.uns)
maporder = np.argsort(obsmap[obsidx])
map_subset = attrmap[m][subset]
attridx = map_subset > 0
orig_attr = getattr(mod, attr).copy()
mod_names = getattr(mod, f"{attr}_names")
_filter_attr(mod, attr, mod_names[map_subset[attridx] - 1])
data.mod[m]._remove_unused_categories(orig_attr, getattr(mod, attr), mod.uns)
maporder = np.argsort(map_subset[attridx])
nobsmap = np.empty(maporder.size)
nobsmap[maporder] = np.arange(1, maporder.size + 1)
obsmap[obsidx] = nobsmap
data.obsmap[m] = obsmap
map_subset[attridx] = nobsmap
getattr(data, f"{attr}map")[m] = map_subset

return


# Utility functions: filtering variables


def filter_var(
def filter_obs(
data: Union[AnnData, MuData], var: Union[str, Sequence[str]], func: Optional[Callable] = None
):
) -> None:
"""
Filter variables (features, e.g. genes) in-place
using any column in .var or row in .X.
Filter observations (samples or cells) in-place
using any column in .obs or in .X.

Parameters
----------
data: AnnData or MuData
AnnData or MuData object
var: str or Sequence[str]
Column name in .var or row name in .X to be used for filtering.
Alternatively, var_names can be provided directly.
Column name in .obs or in .X to be used for filtering.
Alternatively, obs_names can be provided directly.
func
Function to apply to the variable used for filtering.
If the variable is of type boolean and func is an identity function,
the func argument can be omitted.
"""

if data.is_view:
raise ValueError(
"The provided adata is a view. In-place filtering does not operate on views."
)
if data.isbacked:
if isinstance(data, AnnData):
warnings.warn(
"AnnData object is backed. The requested subset of the matrix .X will be read into memory, and the object will not be backed anymore."
)
else:
warnings.warn(
"MuData object is backed. The requested subset of the .X matrices of its modalities will be read into memory, and the object will not be backed anymore."
)

if isinstance(var, str):
if var in data.var.columns:
if func is None:
if data.var[var].dtypes.name == "bool":

def func(x):
return x

else:
raise ValueError(f"Function has to be provided since {var} is not boolean")
var_subset = func(data.var[var].values)
elif var in data.obs_names:
var_subset = func(data.X[:, np.where(data.obs_names == var)[0]].reshape(-1))
else:
raise ValueError(
f"Column name from .var or one of the obs_names was expected but got {var}."
)
else:
if func is None:
var_subset = var if np.array(var).dtype == bool else data.var_names.isin(var)
else:
raise ValueError("When providing var_names directly, func has to be None.")

if isinstance(data, AnnData):
# Collect elements to subset
# NOTE: accessing them after subsetting .var
# will fail due to _validate_value()
varm = dict(data.varm)
varp = dict(data.varp)

# Subset .obs
data._var = data.var[var_subset]
data._n_vars = data.var.shape[0]

# Subset .obsm
for k, v in varm.items():
varm[k] = v[var_subset]
data.varm = varm

# Subset .obsp
for k, v in varp.items():
varp[k] = v[var_subset][:, var_subset]
data.varp = varp

# Subset .X
try:
data._X = data.X[:, var_subset]
except TypeError:
data._X = data.X[:, np.where(var_subset)[0]]
# For some h5py versions, indexing arrays must have integer dtypes
# https://github.com/h5py/h5py/issues/1847
if data.isbacked:
data.file.close()
data.filename = None

# Subset layers
for layer in data.layers:
data.layers[layer] = data.layers[layer][:, var_subset]

# NOTE: .raw is not subsetted

else:
# Subset .var
data._var = data.var[var_subset]
data._n_vars = data.var.shape[0]

# Subset .varm
for k, v in data.varm.items():
data.varm[k] = v[var_subset]

# Subset .varp
for k, v in data.varp.items():
data.varp[k] = v[var_subset][:, var_subset]

# filter_var() for each modality
for m, mod in data.mod.items():
varmap = data.varmap[m][var_subset]
varidx = varmap > 0
orig_var = mod.var.copy()
filter_var(mod, mod.var_names[varmap[varidx] - 1])
data.mod[m]._remove_unused_categories(orig_var, mod.var, mod.uns)
maporder = np.argsort(varmap[varidx])
nvarmap = np.empty(maporder.size)
nvarmap[maporder] = np.arange(1, maporder.size + 1)
varmap[varidx] = nvarmap
data.varmap[m] = varmap
_filter_attr(data, "obs", var, func)

return