file backing improvements
* writing changes in a backed MuData object now works the same as in
  AnnData (using mdata.write())
* assigning a filename to a MuData object now works the same as in
  AnnData: for a backed object the backing file is renamed; a non-backed
  object is written to the assigned file and the MuData is switched to
  backed mode (see the usage sketch below)
ilia-kats committed Jun 4, 2021
1 parent 1c04a3d commit 382bf75
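
A minimal usage sketch of the new behaviour (assuming muon is imported as mu and an existing file "data.h5mu"; the edited column name is illustrative):

import muon as mu

# open the file in backed read/write mode; matrices stay on disk
mdata = mu.read_h5mu("data.h5mu", backed="r+")

# edit annotations in memory, then push the changes to the backing file,
# just like adata.write() does for a backed AnnData
mdata.obs["batch"] = "run1"
mdata.write()

# assigning a new filename renames the backing file of a backed object;
# a non-backed object would instead be written to that file and switched
# to backed mode
mdata.filename = "data_renamed.h5mu"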
Showing 3 changed files with 147 additions and 103 deletions.
45 changes: 32 additions & 13 deletions muon/_core/file_backing.py
@@ -1,6 +1,7 @@
from pathlib import Path
from os import PathLike
from os.path import abspath
from typing import Optional
from typing import Optional, Iterator
from collections import defaultdict
from weakref import WeakSet

@@ -17,24 +18,27 @@ def __init__(
):
self._counter = 0
self._children = WeakSet()
super().__init__(None, abspath(filename), filemode)
if filename is not None:
filename = Path(filename)
super().__init__(None, filename, filemode)

def open(
self,
filename: Optional[PathLike] = None,
filemode: Optional[ad.compat.Literal["r", "r+"]] = None,
add_ref=False,
) -> bool:
if self._file is not None and (
if self.is_open and (
filename is None
and filemode is None
or filename == self.filename
and filemode == self._filemode
and self._file.id
):
self._counter += 1
if add_ref:
self._counter += 1
return False

if self._file is not None and self._file.id:
if self.is_open:
self._file.close()

if filename is not None:
@@ -44,9 +48,9 @@ def open(
if self.filename is None:
raise ValueError("Cannot open backing file if backing not initialized")
self._file = h5py.File(self.filename, self._filemode)
self._counter = 1
self._counter = int(add_ref)
for child in self._children:
child._file = self._file["mod"][child._mod]
child._set_file()
return True

def close(self):
@@ -56,8 +60,8 @@ def _close(self):
def _close(self):
if self._counter > 0:
self._counter -= 1
if self._counter == 0:
self._file.close()
if self._counter == 0 and self.is_open:
self._file.close()

def _to_memory_mode(self):
for m in self._children:
@@ -68,7 +72,7 @@ def _to_memory_mode(self):

@property
def is_open(self) -> bool:
return (self._file is not None) & bool(self._file.id)
return (self._file is not None) and bool(self._file.id)

@AnnDataFileManager.filename.setter
def filename(self, filename: Optional[PathLike]):
@@ -87,16 +91,31 @@ def __init__(
self._parent = parent
self._mod = mod
parent._children.add(self)
super().__init__(adata, parent.filename, parent._filemode)
super().__init__(adata)

if parent.is_open:
self._set_file()

def open(
self,
filename: Optional[PathLike] = None,
filemode: Optional[ad.compat.Literal["r", "r+"]] = None,
):
if not self._parent.open(filename, filemode):
if not self._parent.open(filename, filemode, add_ref=True):
self._set_file()

def _set_file(self):
if self._parent.is_open:
self._file = self._parent._file["mod"][self._mod]

@property
def filename(self) -> Path:
return self._parent.filename

@filename.setter
def filename(self, fname: PathLike):
pass # the setter is needed because it's used in ad._core.file_backing.AnnDataFileManager.__init__

def close(self):
self._parent._close()

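
The changes above amount to a reference-counted parent/child file manager: MuDataFileManager owns the single h5py.File and counts open requests, while each per-modality AnnDataFileManager only points at the corresponding "mod/<name>" group and delegates opening and closing to its parent. A simplified, hypothetical sketch of that pattern (not the actual muon classes):

import h5py
from weakref import WeakSet


class ParentFileManager:
    """Owns the HDF5 handle; keeps it open while any user still needs it."""

    def __init__(self, filename, filemode="r"):
        self.filename, self.filemode = filename, filemode
        self._file, self._counter = None, 0
        self._children = WeakSet()

    def open(self, add_ref=False):
        if self._file is not None and self._file.id:  # already open
            if add_ref:
                self._counter += 1
            return False
        self._file = h5py.File(self.filename, self.filemode)
        self._counter = int(add_ref)
        for child in self._children:  # re-point children at the new handle
            child._set_group()
        return True

    def close(self):
        if self._counter > 0:
            self._counter -= 1
        if self._counter == 0 and self._file is not None and self._file.id:
            self._file.close()


class ChildFileManager:
    """Backs one modality; never owns the file itself."""

    def __init__(self, parent, mod):
        self._parent, self._mod, self._group = parent, mod, None
        parent._children.add(self)

    def open(self):
        self._parent.open(add_ref=True)
        self._set_group()

    def _set_group(self):
        self._group = self._parent._file["mod"][self._mod]

    def close(self):
        self._parent.close()

Closing a modality therefore only decrements the parent's counter; the shared HDF5 handle is closed once nothing holds a reference to it any more.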
158 changes: 78 additions & 80 deletions muon/_core/io.py
@@ -113,74 +113,82 @@ def read_10x_mtx(path: PathLike, extended: bool = True, *args, **kwargs) -> MuData:
#


def write_h5mu(filename: PathLike, mdata: MuData, **kwargs):
"""
Write MuData object to the HDF5 file
Matrices - sparse or dense - are currently stored as they are.
"""
def _write_h5mu(file: h5py.File, mdata: MuData, write_data=True, **kwargs):
from anndata._io.utils import write_attribute
from .. import __version__, __mudataversion__, __anndataversion__

with h5py.File(filename, "a", userblock_size=512) as f:
write_attribute(
f,
"obs",
mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False)),
dataset_kwargs=kwargs,
)
write_attribute(
f,
"var",
mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False)),
dataset_kwargs=kwargs,
)
write_attribute(f, "obsm", mdata.obsm, dataset_kwargs=kwargs)
write_attribute(f, "varm", mdata.varm, dataset_kwargs=kwargs)
write_attribute(f, "obsp", mdata.obsp, dataset_kwargs=kwargs)
write_attribute(f, "varp", mdata.varp, dataset_kwargs=kwargs)
write_attribute(f, "uns", mdata.uns, dataset_kwargs=kwargs)

write_attribute(f, "obsmap", mdata.obsmap, dataset_kwargs=kwargs)
write_attribute(f, "varmap", mdata.varmap, dataset_kwargs=kwargs)
# Remove modalities if they exist
if "mod" in f:
del f["mod"]
mod = f.create_group("mod")
for k, v in mdata.mod.items():
group = mod.create_group(k)

adata = mdata.mod[k]

adata.strings_to_categoricals()
if adata.raw is not None:
adata.strings_to_categoricals(adata.raw.var)
write_attribute(
file,
"obs",
mdata.strings_to_categoricals(mdata._shrink_attr("obs", inplace=False)),
dataset_kwargs=kwargs,
)
write_attribute(
file,
"var",
mdata.strings_to_categoricals(mdata._shrink_attr("var", inplace=False)),
dataset_kwargs=kwargs,
)
write_attribute(file, "obsm", mdata.obsm, dataset_kwargs=kwargs)
write_attribute(file, "varm", mdata.varm, dataset_kwargs=kwargs)
write_attribute(file, "obsp", mdata.obsp, dataset_kwargs=kwargs)
write_attribute(file, "varp", mdata.varp, dataset_kwargs=kwargs)
write_attribute(file, "uns", mdata.uns, dataset_kwargs=kwargs)

write_attribute(file, "obsmap", mdata.obsmap, dataset_kwargs=kwargs)
write_attribute(file, "varmap", mdata.varmap, dataset_kwargs=kwargs)

mod = file.require_group("mod")
for k, v in mdata.mod.items():
group = mod.require_group(k)

adata = mdata.mod[k]

adata.strings_to_categoricals()
if adata.raw is not None:
adata.strings_to_categoricals(adata.raw.var)

if write_data:
write_attribute(group, "X", adata.X, dataset_kwargs=kwargs)
if adata.raw is not None:
write_h5ad_raw(group, "raw", adata.raw)

write_attribute(group, "obs", adata.obs, dataset_kwargs=kwargs)
write_attribute(group, "var", adata.var, dataset_kwargs=kwargs)
write_attribute(group, "obsm", adata.obsm, dataset_kwargs=kwargs)
write_attribute(group, "varm", adata.varm, dataset_kwargs=kwargs)
write_attribute(group, "obsp", adata.obsp, dataset_kwargs=kwargs)
write_attribute(group, "varp", adata.varp, dataset_kwargs=kwargs)
write_attribute(group, "layers", adata.layers, dataset_kwargs=kwargs)
write_attribute(group, "uns", adata.uns, dataset_kwargs=kwargs)

attrs = group.attrs
attrs["encoding-type"] = "AnnData"
attrs["encoding-version"] = __anndataversion__
attrs["encoder"] = "muon"
attrs["encoder-version"] = __version__

attrs = f.attrs
attrs["encoding-type"] = "MuData"
attrs["encoding-version"] = __mudataversion__
if adata.raw is not None:
write_h5ad_raw(group, "raw", adata.raw)

write_attribute(group, "obs", adata.obs, dataset_kwargs=kwargs)
write_attribute(group, "var", adata.var, dataset_kwargs=kwargs)
write_attribute(group, "obsm", adata.obsm, dataset_kwargs=kwargs)
write_attribute(group, "varm", adata.varm, dataset_kwargs=kwargs)
write_attribute(group, "obsp", adata.obsp, dataset_kwargs=kwargs)
write_attribute(group, "varp", adata.varp, dataset_kwargs=kwargs)
write_attribute(group, "layers", adata.layers, dataset_kwargs=kwargs)
write_attribute(group, "uns", adata.uns, dataset_kwargs=kwargs)

attrs = group.attrs
attrs["encoding-type"] = "AnnData"
attrs["encoding-version"] = __anndataversion__
attrs["encoder"] = "muon"
attrs["encoder-version"] = __version__

attrs = file.attrs
attrs["encoding-type"] = "MuData"
attrs["encoding-version"] = __mudataversion__
attrs["encoder"] = "muon"
attrs["encoder-version"] = __version__

# Restore top-level annotation
if not mdata.is_view or not mdata.isbacked:
mdata.update()


def write_h5mu(filename: PathLike, mdata: MuData, **kwargs):
"""
Write MuData object to the HDF5 file
Matrices - sparse or dense - are currently stored as they are.
"""
from .. import __version__, __mudataversion__, __anndataversion__

with h5py.File(filename, "w", userblock_size=512) as f:
_write_h5mu(f, mdata, **kwargs)
with open(filename, "br+") as f:
nbytes = f.write(
f"MuData (format-version={__mudataversion__};creator=muon;creator-version={__version__})".encode(
@@ -190,9 +198,6 @@ def write_h5mu(filename: PathLike, mdata: MuData, **kwargs):
f.write(
b"\0" * (512 - nbytes)
) # this is only needed because the H5file was written in append mode
# Restore top-level annotation
if not mdata.is_view or not mdata.isbacked:
mdata.update()


def write_h5ad(filename: PathLike, mod: str, data: Union[MuData, AnnData]):
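
write_h5mu now creates the HDF5 file with a 512-byte user block and writes a plain-text MuData header into it; that header is what lets read_h5mu (below) recognise an .h5mu file before handing it to h5py. A small sketch of that check, assuming an existing "data.h5mu":

# peek at the user block: a MuData file starts with b"MuData" followed by
# format-version/creator metadata and is padded with NUL bytes to 512 bytes
with open("data.h5mu", "rb") as fh:
    header = fh.read(512).rstrip(b"\0")
print(header.startswith(b"MuData"), header.decode(errors="replace"))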
@@ -352,8 +357,7 @@ def read_h5mu(filename: PathLike, backed: Union[str, bool, None] = None):
mode = "r"
else:
mode = backed
if backed:
manager = MuDataFileManager(filename, mode)
manager = MuDataFileManager(filename, mode) if backed else MuDataFileManager()
with open(filename, "rb") as f:
ish5mu = f.read(6) == b"MuData"
if not ish5mu:
@@ -369,30 +373,23 @@ def _read_h5mu_mod_backed(g: "h5py.Group", manager: MuDataFileManager) -> dict:
for k in f.keys():
if k in ["obs", "var"]:
d[k] = read_dataframe(f[k])
elif backed and k == "mod":
mods = {}
gmods = f[k]
for m in gmods.keys():
mods[m] = _read_h5mu_mod_backed(gmods[m], manager)
d[k] = mods
elif k == "mod":
if k == "mod":
mods = {}
gmods = f[k]
for m in gmods.keys():
mods[m] = read_h5ad(filename, m)
ad = _read_h5mu_mod_backed(gmods[m]) if backed else read_h5ad(filename, m)
ad.file = AnnDataFileManager(ad, m, manager)
mods[m] = ad
d[k] = mods
else:
d[k] = read_attribute(f[k])

mu = MuData._init_from_dict_(**d)
if backed:
mu.filename = filename
mu.filemode = mode
mu.file = manager
mu.file = manager
return mu


def _read_h5mu_mod_backed(g: "h5py.Group", manager: MuDataFileManager) -> dict:
def _read_h5mu_mod_backed(g: "h5py.Group") -> dict:
from anndata._io.utils import read_attribute
from anndata._io.h5ad import read_dataframe, _read_raw
from anndata import Raw
@@ -414,7 +411,6 @@ def _read_h5mu_mod_backed(g: "h5py.Group", manager: MuDataFileManager) -> dict:
elif k != "raw":
d[k] = read_attribute(g[k])
ad = AnnData(**d)
ad.file = AnnDataFileManager(ad, os.path.basename(g.name), manager)

raw = _read_raw(g, attrs={"var", "varm"})
if raw:
@@ -462,7 +458,9 @@ def read_h5ad(
with h5py.File(filename, hdf5_mode) as f_root:
f = f_root["mod"][mod]
if backed:
return _read_h5mu_mod_backed(f, manager)
ad = _read_h5mu_mod_backed(f)
ad.file = AnnDataFileManager(ad, mod, manager)
return ad

for k in f.keys():
if k in ["obs", "var"]:
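
read_h5ad gets the matching treatment for a single modality: in backed mode it now attaches an AnnDataFileManager that points at the modality's group inside the shared .h5mu file. A hedged usage sketch (assuming read_h5ad is exposed at the package level, as read_h5mu is, and that "data.h5mu" contains an "rna" modality):

import muon as mu

# load only the "rna" modality from the .h5mu file, backed on disk
adata = mu.read_h5ad("data.h5mu", mod="rna", backed="r")
print(adata.isbacked, adata.shape)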
