diff --git a/.gitignore b/.gitignore index e9f3571ca8..3587be0f89 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ benchmarks/results .idea .vscode *.lock + +# dev +tmp/ \ No newline at end of file diff --git a/package/MDAnalysis/analysis/dasktimeseries.py b/package/MDAnalysis/analysis/dasktimeseries.py new file mode 100644 index 0000000000..3518fd9811 --- /dev/null +++ b/package/MDAnalysis/analysis/dasktimeseries.py @@ -0,0 +1,49 @@ +from .results import Results, ResultsGroup +import dask.array as da +import numpy as np + + +class DaskTimeSeriesAnalysisBase: + + def __init__(self, dask_timeseries, verbose=False, **kwargs): + self._dts = dask_timeseries + self._verbose = verbose + self.results = Results() + + def _prepare(self): + pass # pylint: disable=unnecessary-pass + + def _compute(self): + pass + + def _conclude(self): + pass # pylint: disable=unnecessary-pass + + def run(self): + self._prepare() + self._compute() + self._conclude() + return self + + +class DaskRMSF(DaskTimeSeriesAnalysisBase): + def __init__(self, dask_timeseries, verbose=False, **kwargs): + super().__init__(dask_timeseries, verbose=verbose, **kwargs) + + def _prepare(self): + n_atoms = len(self._dts[0]) + self.results["rmsf"] = np.zeros((n_atoms, 3)) + + def _compute(self): + positions = self._dts + mean_positions = positions.mean(axis=0) + subtracted_positions = positions - mean_positions + squared_deviations = subtracted_positions**2 + avg_squared_deviations = squared_deviations.mean(axis=0) + sqrt_avg_squared_deviations = da.sqrt(avg_squared_deviations) + self.results.rmsf = da.sqrt( + (sqrt_avg_squared_deviations**2).sum(axis=1) + ).compute() + + def _conclude(self): + pass diff --git a/package/MDAnalysis/coordinates/H5MD.py b/package/MDAnalysis/coordinates/H5MD.py index 48283113f4..38e1bab4ba 100644 --- a/package/MDAnalysis/coordinates/H5MD.py +++ b/package/MDAnalysis/coordinates/H5MD.py @@ -208,7 +208,9 @@ """ import warnings - +import dask.array as da +from .. import units +from typing import Any, Union, Optional, List, Dict import numpy as np import MDAnalysis as mda from . import base, core @@ -810,6 +812,57 @@ def Writer(self, filename, n_atoms=None, **kwargs): kwargs.setdefault('forces', self.has_forces) return H5MDWriter(filename, n_atoms, **kwargs) + def dask_timeseries(self, asel: Optional['AtomGroup']=None, + atomgroup: Optional['Atomgroup']=None, + start: Optional[int]=None, stop: Optional[int]=None, + step: Optional[int]=None, + order: Optional[str]='fac') -> np.ndarray: + if asel is not None: + warnings.warn( + "asel argument to timeseries will be renamed to" + "'atomgroup' in 3.0, see #3911", + category=DeprecationWarning) + if atomgroup: + raise ValueError("Cannot provide both asel and atomgroup kwargs") + atomgroup = asel + start, stop, step = self.check_slice_indices(start, stop, step) + nframes = len(range(start, stop, step)) + + if atomgroup is not None: + if len(atomgroup) == 0: + raise ValueError( + "Timeseries requires at least one atom to analyze") + atom_numbers = atomgroup.indices + natoms = len(atom_numbers) + else: + natoms = self.n_atoms + atom_numbers = np.arange(natoms) + + coordinates = da.from_array(self._particle_group['position']['value'],)[start:stop:step, atom_numbers, :] + + # switch axes around + default_order = 'fac' + if order != default_order: + try: + newidx = [default_order.index(i) for i in order] + except ValueError: + raise ValueError(f"Unrecognized order key in {order}, " + "must be permutation of 'fac'") + + try: + coordinates = da.moveaxis(coordinates, newidx, [0, 1, 2]) + except ValueError: + errmsg = ("Repeated or missing keys passed to argument " + f"`order`: {order}, each key must be used once") + raise ValueError(errmsg) + + f = units.get_conversion_factor('length', + self.units['length'], 'Angstrom') + coordinates *= f + + return coordinates + + @property def has_positions(self): """``True`` if 'position' group is in trajectory.""" diff --git a/tmp/env.yaml b/tmp/env.yaml new file mode 100644 index 0000000000..cf6237e492 --- /dev/null +++ b/tmp/env.yaml @@ -0,0 +1,43 @@ +name: dask-timeseries-dev +channels: + - defaults + - conda-forge +dependencies: + - chemfiles>=0.10 + - codecov + - cython + - dask + - docutils + - fasteners + - griddataformats + - gsd + - h5py>=2.10 + - hypothesis + - ipykernel + - joblib>=0.12 + - mdanalysis-sphinx-theme >=1.3.0 + - matplotlib>=3.2.2 + - mmtf-python + - mock + - networkx + - numpy>=1.23.2 + - pytest + - python==3.10 + - pytng>=0.2.3 + - scikit-learn + - scipy + - pip + - sphinx <7.0 + - tidynamics>=1.0.0 + - tqdm>=4.43.0 + - sphinxcontrib-bibtex + - mdaencore + - waterdynamics + - pip: + - mdahole2 + - pathsimanalysis + - duecredit + - parmed + - sphinx-sitemap + - packaging + - pyedr>=0.7.0 diff --git a/tmp/lazyts.ipynb b/tmp/lazyts.ipynb new file mode 100644 index 0000000000..f017496e09 --- /dev/null +++ b/tmp/lazyts.ipynb @@ -0,0 +1,230 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Universe" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/nfs/homes3/ljwoods2/workspace/mdanalysis/package/MDAnalysis/topology/PDBParser.py:348: UserWarning: Unknown element Z found for some atoms. These have been given an empty element record. If needed they can be guessed using MDAnalysis.topology.guessers.\n", + " warnings.warn(wmsg)\n", + "/nfs/homes3/ljwoods2/workspace/mdanalysis/package/MDAnalysis/topology/PDBParser.py:348: UserWarning: Unknown element D found for some atoms. These have been given an empty element record. If needed they can be guessed using MDAnalysis.topology.guessers.\n", + " warnings.warn(wmsg)\n", + "/nfs/homes3/ljwoods2/workspace/mdanalysis/package/MDAnalysis/topology/guessers.py:146: UserWarning: Failed to guess the mass for the following atom types: D\n", + " warnings.warn(\"Failed to guess the mass for the following atom types: {}\".format(atom_type))\n", + "/nfs/homes3/ljwoods2/workspace/mdanalysis/package/MDAnalysis/topology/guessers.py:146: UserWarning: Failed to guess the mass for the following atom types: Z\n", + " warnings.warn(\"Failed to guess the mass for the following atom types: {}\".format(atom_type))\n" + ] + } + ], + "source": [ + "import MDAnalysis as mda\n", + "\n", + "TOPOL = \"/scratch/ljwoods2/workspace/zarrtraj/zarrtraj/data/yiip_equilibrium/YiiP_system.pdb\"\n", + "TRAJ = \"/scratch/ljwoods2/workspace/zarrtraj/zarrtraj/data/yiip_aligned_compressed.h5md\"\n", + "\n", + "u = mda.Universe(TOPOL, TRAJ)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run dask RMSF method" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DaskRMSF execution time: 36.913174867630005\n" + ] + } + ], + "source": [ + "from MDAnalysis.analysis.dasktimeseries import DaskRMSF\n", + "import time\n", + "\n", + "start = time.time()\n", + "dask_timeseries = u.trajectory.dask_timeseries()\n", + "dRMSF = DaskRMSF(dask_timeseries).run()\n", + "dRMSF_result = dRMSF.results.rmsf\n", + "stop = time.time()\n", + "\n", + "print(f\"DaskRMSF execution time: {stop - start}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run RMSF method shipped with mda" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSF execution time: 2235.1541423797607\n" + ] + } + ], + "source": [ + "from MDAnalysis.analysis.rms import RMSF\n", + "\n", + "start = time.time()\n", + "RMSF = RMSF(u.atoms).run()\n", + "RMSF_result = RMSF.results.rmsf\n", + "stop = time.time()\n", + "\n", + "print(f\"RMSF execution time: {stop - start}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ensure results are the same" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from numpy.testing import assert_allclose\n", + "\n", + "assert_allclose(dRMSF_result, RMSF_result, atol=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print num cores used by dask scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "24" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "import multiprocessing\n", + "multiprocessing.cpu_count()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Settings used to write the trajectory used above" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import MDAnalysis as mda\n", + "import MDAnalysisData\n", + "from MDAnalysis.analysis import align\n", + "\n", + "yiip = MDAnalysisData.yiip_equilibrium.fetch_yiip_equilibrium_long()\n", + "\n", + "u = mda.Universe(\n", + " yiip.topology,\n", + " yiip.trajectory\n", + ")\n", + "\n", + "average = align.AverageStructure(\n", + " u, u, select=\"protein and name CA\", ref_frame=0\n", + ").run()\n", + "ref = average.results.universe\n", + "\n", + "# Writer kwargs only passable to align in mda 2.8.0\n", + "# to relax requirement, just write in two steps\n", + "\n", + "# 1. Align traj and write to xtc\n", + "aligner = align.AlignTraj(\n", + " u,\n", + " ref,\n", + " select=\"protein and name CA\",\n", + " filename=\"/scratch/ljwoods2/workspace/zarrtraj/data/yiip_equilibrium/YiiP_system_90ns_center_aligned.xtc\",\n", + ").run()\n", + "\n", + "# 2. Write aligned xtc traj to H5MD\n", + "u_aligned = mda.Universe(\n", + " yiip.topology,\n", + " \"/scratch/ljwoods2/workspace/zarrtraj/data/yiip_equilibrium/YiiP_system_90ns_center_aligned.xtc\"\n", + ")\n", + "\n", + "with mda.Writer(\n", + " \"/scratch/ljwoods2/workspace/zarrtraj/zarrtraj/data/yiip_aligned_compressed.h5md\",\n", + " n_atoms=u_aligned.trajectory.n_atoms,\n", + " n_frames=u_aligned.trajectory.n_frames,\n", + " compression=\"gzip\",\n", + " compression_opts=9,\n", + " chunks=(9, u.trajectory.n_atoms, 3),\n", + ") as W:\n", + " for ts in u_aligned.trajectory:\n", + " W.write(u_aligned.atoms)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mda-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}