Merge pull request #121 from LSSTDESC/issue/120/iterator

Issue/120/iterator
LSSTDESC · Oct 25, 2022 · 41a477b · 41a477b
2 parents c25b81b + 122e507
commit 41a477b
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 1 deletion.
diff --git a/nb/iterator_demo.ipynb b/nb/iterator_demo.ipynb
@@ -0,0 +1,107 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94dcfc9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import os\n",
+    "import qp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d479510b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))\n",
+    "data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d493280",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ens = qp.read(data_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2928f6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "itr = qp.iterator(data_file, 10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bef60c76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ens.npdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d908b02b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_vals = np.linspace(0., 1., 11)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d00c68fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for start, end, ens_i in itr:\n",
+    "    print(start, end, np.max(ens[start:end].pdf(test_vals) - ens_i.pdf(test_vals)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9d7b748",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/qp/__init__.py b/src/qp/__init__.py
@@ -11,7 +11,7 @@
 from .sparse_pdf import *
 from .scipy_pdfs import *
 from .ensemble import Ensemble
-from .factory import instance, add_class, create, read, convert, concatenate
+from .factory import instance, add_class, create, read, convert, concatenate, iterator
 
 from . import utils
 

diff --git a/src/qp/data/test.hdf5 b/src/qp/data/test.hdf5
diff --git a/src/qp/factory.py b/src/qp/factory.py
@@ -165,6 +165,48 @@ def read(self, filename):
         return Ensemble(ctor_func, data=data, ancil=ancil_table)
 
 
+    def iterator(self, filename, chunk_size=100_000, rank=0, parallel_size=1):
+        """Return an iterator for chunked read
+
+        Parameters
+        ----------
+        filename : `str`
+
+        chunk_size : `int`
+        """
+        extension = os.path.splitext(filename)[1]
+        if extension not in ['.hdf5']:  #pragma: no cover
+            raise TypeError("Can only use qp.iterator on hdf5 files")
+
+        metadata = io.readHdf5ToDict(filename, 'meta')
+        pdf_name = metadata.pop('pdf_name')[0].decode()
+        pdf_version = metadata.pop('pdf_version')[0]
+        if pdf_name not in self: #pragma: no cover
+            raise KeyError("Class nameed %s is not in factory" % pdf_name)
+        the_class = self[pdf_name]
+        reader_convert = the_class.reader_method(pdf_version)
+        ctor_func = the_class.creation_method(None)
+
+        f, infp = io.readHdf5Group(filename, 'data')
+        try:
+            ancil_f, ancil_infp = io.readHdf5Group(filename, 'data')
+        except KeyError:  #pragma: no cover
+            ancil_f, ancil_infp = (None, None)
+        num_rows = io.getGroupInputDataLength(f)
+        ranges = io.data_ranges_by_rank(num_rows, chunk_size, parallel_size, rank)
+        data = OrderedDict()
+        ancil_data = OrderedDict()
+        for start, end in ranges:
+            for key, val in f.items():
+                data[key] = io.readHdf5DatasetToArray(val, start, end)
+            if ancil_f is not None:
+                for key, val in ancil_f.items():
+                    ancil_data[key] = io.readHdf5DatasetToArray(val, start, end)
+            yield start, end, Ensemble(ctor_func, data=data, ancil=ancil_data)
+        infp.close()
+        if ancil_infp is not None:
+            ancil_infp.close()
+
     def convert(self, in_dist, class_name, **kwds):
         """Read an ensemble to a different repersenation
 
@@ -262,5 +304,6 @@ def instance():
 add_class = _FACTORY.add_class
 create = _FACTORY.create
 read = _FACTORY.read
+iterator = _FACTORY.iterator
 convert = _FACTORY.convert
 concatenate = _FACTORY.concatenate
diff --git a/tests/qp/test_ensemble.py b/tests/qp/test_ensemble.py
@@ -187,6 +187,18 @@ def test_interp(self):
         assert isinstance(ens_i.gen_obj, qp.interp_gen)
         self._run_ensemble_funcs(ens_i, cls_test_data['test_xvals'])
 
+    def test_iterator(self):
+        """ Test the iterated read """
+        QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
+        data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
+        ens = qp.read(data_file)
+        itr = qp.iterator(data_file, 10)
+        test_grid = np.linspace(0., 1., 11)
+        for start, end, ens_i in itr:
+            check_vals = ens[start:end].pdf(test_grid)
+            test_vals = ens_i.pdf(test_grid)
+            assert np.allclose(check_vals, test_vals)
+
 
 if __name__ == '__main__':
     unittest.main()