Skip to content

Commit

Permalink
Merge pull request #121 from LSSTDESC/issue/120/iterator
Browse files Browse the repository at this point in the history
Issue/120/iterator
  • Loading branch information
sschmidt23 authored Oct 25, 2022
2 parents c25b81b + 122e507 commit 41a477b
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 1 deletion.
107 changes: 107 additions & 0 deletions nb/iterator_demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "94dcfc9a",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import os\n",
"import qp"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d479510b",
"metadata": {},
"outputs": [],
"source": [
"QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))\n",
"data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d493280",
"metadata": {},
"outputs": [],
"source": [
"ens = qp.read(data_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2928f6b",
"metadata": {},
"outputs": [],
"source": [
"itr = qp.iterator(data_file, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bef60c76",
"metadata": {},
"outputs": [],
"source": [
"ens.npdf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d908b02b",
"metadata": {},
"outputs": [],
"source": [
"test_vals = np.linspace(0., 1., 11)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d00c68fe",
"metadata": {},
"outputs": [],
"source": [
"for start, end, ens_i in itr:\n",
" print(start, end, np.max(ens[start:end].pdf(test_vals) - ens_i.pdf(test_vals)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9d7b748",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion src/qp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .sparse_pdf import *
from .scipy_pdfs import *
from .ensemble import Ensemble
from .factory import instance, add_class, create, read, convert, concatenate
from .factory import instance, add_class, create, read, convert, concatenate, iterator

from . import utils

Expand Down
Binary file added src/qp/data/test.hdf5
Binary file not shown.
43 changes: 43 additions & 0 deletions src/qp/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,48 @@ def read(self, filename):
return Ensemble(ctor_func, data=data, ancil=ancil_table)


def iterator(self, filename, chunk_size=100_000, rank=0, parallel_size=1):
"""Return an iterator for chunked read
Parameters
----------
filename : `str`
chunk_size : `int`
"""
extension = os.path.splitext(filename)[1]
if extension not in ['.hdf5']: #pragma: no cover
raise TypeError("Can only use qp.iterator on hdf5 files")

metadata = io.readHdf5ToDict(filename, 'meta')
pdf_name = metadata.pop('pdf_name')[0].decode()
pdf_version = metadata.pop('pdf_version')[0]
if pdf_name not in self: #pragma: no cover
raise KeyError("Class nameed %s is not in factory" % pdf_name)
the_class = self[pdf_name]
reader_convert = the_class.reader_method(pdf_version)
ctor_func = the_class.creation_method(None)

f, infp = io.readHdf5Group(filename, 'data')
try:
ancil_f, ancil_infp = io.readHdf5Group(filename, 'data')
except KeyError: #pragma: no cover
ancil_f, ancil_infp = (None, None)
num_rows = io.getGroupInputDataLength(f)
ranges = io.data_ranges_by_rank(num_rows, chunk_size, parallel_size, rank)
data = OrderedDict()
ancil_data = OrderedDict()
for start, end in ranges:
for key, val in f.items():
data[key] = io.readHdf5DatasetToArray(val, start, end)
if ancil_f is not None:
for key, val in ancil_f.items():
ancil_data[key] = io.readHdf5DatasetToArray(val, start, end)
yield start, end, Ensemble(ctor_func, data=data, ancil=ancil_data)
infp.close()
if ancil_infp is not None:
ancil_infp.close()

def convert(self, in_dist, class_name, **kwds):
"""Read an ensemble to a different repersenation
Expand Down Expand Up @@ -262,5 +304,6 @@ def instance():
add_class = _FACTORY.add_class
create = _FACTORY.create
read = _FACTORY.read
iterator = _FACTORY.iterator
convert = _FACTORY.convert
concatenate = _FACTORY.concatenate
12 changes: 12 additions & 0 deletions tests/qp/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,18 @@ def test_interp(self):
assert isinstance(ens_i.gen_obj, qp.interp_gen)
self._run_ensemble_funcs(ens_i, cls_test_data['test_xvals'])

def test_iterator(self):
""" Test the iterated read """
QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
ens = qp.read(data_file)
itr = qp.iterator(data_file, 10)
test_grid = np.linspace(0., 1., 11)
for start, end, ens_i in itr:
check_vals = ens[start:end].pdf(test_grid)
test_vals = ens_i.pdf(test_grid)
assert np.allclose(check_vals, test_vals)


if __name__ == '__main__':
unittest.main()

0 comments on commit 41a477b

Please sign in to comment.