From c0c92e952ec7e14e02c479fe50629eead7bcd38c Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Thu, 8 Jun 2023 15:45:40 -0400 Subject: [PATCH 01/12] revising getting started documentation, aside from reorg-related updates --- docs/source/citing.rst | 49 +++++++++++++++------------------- docs/source/contributing.rst | 25 +++++++++++------ docs/source/futureplans.rst | 24 ----------------- docs/source/immediateplans.rst | 34 ----------------------- docs/source/index_body.rst | 23 +++++----------- docs/source/overview.rst | 47 +++++++++++++++++++++++--------- 6 files changed, 81 insertions(+), 121 deletions(-) delete mode 100644 docs/source/futureplans.rst delete mode 100644 docs/source/immediateplans.rst diff --git a/docs/source/citing.rst b/docs/source/citing.rst index 99fca1f5..494270b4 100644 --- a/docs/source/citing.rst +++ b/docs/source/citing.rst @@ -2,32 +2,13 @@ Citing RAIL *********** -This code, while public on GitHub, has not yet been released by -DESC and is still under active development. Our release of v1.0 will -be accompanied by a journal paper describing the development and -validation of `RAIL`. +RAIL is open source and may be used according to the terms of its `LICENSE `_ `(BSD 3-Clause) `_. +If you make use of the ideas or software here in any publication, you must cite this repository as "LSST-DESC PZ WG (in prep)" with the `Zenodo DOI `_. +Please consider also inviting the developers as co-authors on publications resulting from your use of RAIL by `making an issue `_. -If you make use of the ideas or software here, please cite this -repository ``https://github.com/LSSTDESC/RAIL``. You are welcome to -re-use the code, which is open source and available under terms -consistent with our `LICENSE -`_ (`BSD 3-Clause -`_). 
+The following list provides the necessary references for external codes accessible through the RAIL ecosystem, which must be cited as follows if those methods are used in a publication: -External contributors and DESC members wishing to use RAIL for non-DESC projects -should consult with the Photometric Redshifts (PZ) Working Group conveners, -ideally before the work has started, but definitely before any publication or -posting of the work to the arXiv. - -*********************************** -Citing individual codes within RAIL -*********************************** - -Several of the codes included in RAIL are pre-existing codes written by external developers, -if these codes are used in an analysis they should be cited individually in addition to the -citation to RAIL. - -Code references: +| Astropy: | Bayesian Photometric Redshifts (BPZ/BPZ_lite): | `Benitez (2000) `_ @@ -36,14 +17,28 @@ Code references: | Delight: | `Leistedt & Hogg (2017) `_ +| DSPS: + | FlexZBoost: -| `Izbicki & Lee (2017) -`_ +| `Izbicki & Lee (2017) `_ | `Dalmasso et al (2020) `_ +| FSPS: + +| GPz: + | PZFlowPDF: -| J. F. Crenshaw (in prep) +| J. F. Crenshaw et al (in prep) | `Zenodo link `_ +| Scikit-learn: + +| SOM(oclu): + | trainZ: | `Schmidt, Malz et al (2020) `_ + +| varInference: + +*fill in more of these* + diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index 5a186a38..fa2add9f 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -2,14 +2,21 @@ Contributing ************ +RAIL is developed publicly on GitHub and welcomes all interested developers, regardless of DESC membership or LSST data rights. +The best way to get involved is to comment on `Issues `_ and `make Pull Requests `_ for your contributions. + +Professional astronomers (including students!) based in the US, Chile, or a French IN2P3 institution are encouraged to `join the LSST-DESC `_ to gain access to the `\#desc-pz-rail `_ channel on the LSSTC Slack workspace. 
+Those without data rights who wish to gain access to the Slack channel should `create an Issue `_ to request that the team leads initiate the process for adding a DESC External Collaborator. + Where to contribute: RAIL packages ================================== -Similar to the installation process, depending on how you want to contribute to RAIL you will be contributing to one or more of the RAIL packages. Given the package structure we imagine three main use cases for contributions: +Similar to the installation process, depending on how you want to contribute to RAIL, you will be contributing to one or more of the RAIL packages. +Given the package structure we imagine three main use cases for contributions: -1. If you are contributing to the core code base, or developing an algorithm that has minimal dependencies, you will probably only be contributing to RAIL, and only need to install the source code for RAIL. -2. If you are contritubing a new algorithm that does depend on a number of other packages beyond numpy, scipy and sklearn, you will probably be making a new rail_ package, and eventually adding to the dependencies in rail_hub. -3. If you are using existing algorithms to do studies and build analysis pipelines to do those studies, you will probably only be contriubting to rail_pipelines. +1. To contribute to the core codebase, including algorithms with no special dependencies, install RAIL from source, indicate what you aim to do in an Issue, and follow the Contribution workflow below. +2. To contribute a new algorithm or engine that depends on packages beyond numpy and scipy, you will probably be making a new rail_ repository, and eventually adding it to the dependencies in rail_hub. +3. To contribute analysis pipelines you built with RAIL Stages, clone `rail_pipelines` from source and follow the Contribution workflow instructions below. 
@@ -21,8 +28,7 @@ When you identify something that should be done, `make an issue `_ to work on, assign yourself, and leave a comment on the issue's discussion page to let others know you're working on it. @@ -42,9 +48,12 @@ As regards `full coverage`, the automatic tests will require that 100% of the li When you're ready to merge your branch into the `main` branch, `make a pull request `_, and request that other team members review it if you have any in mind, for example, those who have consulted on some of the work. -Once the changes have been approved, you can merge and squash the pull request as well as close its corresponding issue by putting `closes #[#]` in the comment closing the pull request. +Once the changes have been approved, 1. select "Squash and merge" on the approved pull request, 2. enter `closes #[#]` in the comment field to close the resolved issue, and 3. delete your branch using the button on the merged pull request. -To review a pull request, it's a good idea to start by pulling the changes and running the unit tests (see above). If there are no problems with that, you can make suggestions for optional improvements (e.g. adding a one-line comment before a clever block of code or including a demonstration of new functionality in the example notebooks) or request necessary changes (e.g. including an exception for an edge case that will break the code or separating out code that's repeated in multiple places). +To review a pull request, it's a good idea to start by pulling the changes and running the unit tests (see above). +Check the code for complete and accurate docstrings, sufficient comments, and to ensure any instances of `#pragma: no cover` (excluding the code from unit test coverage accounting) are extremely well-justified. +Necessary changes to request may include, e.g. writing an exception for an edge case that will break the code, separating out code that's repeated in multiple places, etc. 
+You may also make suggestions for optional improvements, such as adding a one-line comment before a clever block of code or including a demonstration of new functionality in the example notebooks. diff --git a/docs/source/futureplans.rst b/docs/source/futureplans.rst deleted file mode 100644 index 3ae6c677..00000000 --- a/docs/source/futureplans.rst +++ /dev/null @@ -1,24 +0,0 @@ -************ -Future plans -************ - -RAIL's design aims to break up the PZ WG's pipeline responsibilities into smaller milestones that can be accomplished by individuals or small groups on short timescales, i.e. under a year. -The next stages of RAIL development (tentative project codenames subject to change) are intended to be paper-able projects, each of which addresses one or more SRM deliverables by incrementally -advancing the code along the way to project completion. They are scoped such that any can be executed in any order or even simultaneously. - -* *RAILyard*: Assess the performance of template-fitting codes by extending the creation subpackage to forward model templates. - -* *RAIL network*: Assess the performance of clustering-redshift methods by extending the creation subpackage to forward model positions. - -* *Off the RAILs*: Investigate the effects of erroneous spectroscopic redshifts (or uncertain narrow-band photo-zs) in a training set by extending the creation subpackage's imperfect prior model. - -* *Third RAIL*: Investigate the effects of imperfect deblending on estimated photo-z posteriors by extending the creation subpackage to forward model the effect of imperfect deblending. - -* *RAIL gauge*: Investigate the impact of measurement errors (PSF, aperture photometry, flux calibration, etc.) on estimated photo-z posteriors by including their effects in the the forward model of the creation subpackage. - -* *DERAIL*: Investigate the impact of imperfect photo-z posterior estimation on a probe-specific (e.g. 
:math:`3\times2{\rm pt}`) cosmological parameter constraint by connecting the estimation subpackage to other DESC pipelines. - -* *RAIL line*: Assess the sensitivity of estimated photo-z posteriors to photometry impacted by emission lines by extending the creation subpackage's forward model. - -Informal library of fun train-themed names for future projects/pipelines built with RAIL include: -`monoRAIL`, `tRAILblazer`, `tRAILmix`, `tRAILer`. diff --git a/docs/source/immediateplans.rst b/docs/source/immediateplans.rst deleted file mode 100644 index 00511976..00000000 --- a/docs/source/immediateplans.rst +++ /dev/null @@ -1,34 +0,0 @@ -*************** -Immediate plans -*************** - -This repo is home to a series of LSST-DESC projects aiming to quantify the impact of imperfect prior information on probabilistic redshift estimation. -An outline of the baseline RAIL is illustrated `here `_. - -1. *Golden Spike*: Build the basic infrastructure for controlled experiments of forward-modeled photo-z posteriors -================================================================================================================== - -* a `rail.creation` subpackage that can generate true photo-z posteriors and mock photometry. - -* a `rail.estimation` subpackage with a superclass for photo-z posterior estimation routines and at least one subclass template example implementing the trainZ (experimental control) algorithm. - -* a `rail.evaluation` subpackage that calculates at least the metrics from the `PZ DC1 Paper `_ for estimated photo-z posteriors relative to the true photo-z posteriors. - -* documented scripts that demonstrate the use of RAIL in a DC1-like experiment on NERSC. - -* sufficient documentation for a v1.0 release. - -* an LSST-DESC Note presenting the RAIL infrastructure. - -2. 
*RAILroad*: Quantify the impact of nonrepresentativity (imbalance and incompleteness) of a training set on estimated photo-z posteriors by multiple machine learning methods -=============================================================================================================================================================================== - -* parameter specifications for degrading an existing `Creator` to make an imperfect prior of the form of nonrepresentativity into the observed photometry. - -* at least two `Estimator` wrapped machine learning-based codes for estimating photo-z posteriors. - -* additional `Evaluator` metrics with feed-through access to the `qp `_ metrics. - -* end-to-end documented scripts that demonstrate a blinded experiment on NERSC. - -* an LSST-DESC paper presenting the results of the experiment. diff --git a/docs/source/index_body.rst b/docs/source/index_body.rst index 3feaa57d..72d61643 100644 --- a/docs/source/index_body.rst +++ b/docs/source/index_body.rst @@ -2,32 +2,23 @@ RAIL: Redshift Assessment Infrastructure Layers =============================================== -The LSST-DESC Redshift Assessment Infrastructure Layer (RAIL) code is a framework to perform photometric redshift (PZ) estimation and analysis for DESC. +RAIL is a flexible software library providing tools to produce at-scale photometric redshift data products, including uncertainties and summary statistics, and stress-test them under realistically complex systematics. -RAIL's purpose is to be the infrastructure enabling the PZ working group deliverables in `the LSST-DESC Science Roadmap (see Sec. 4.14) `_, -aiming to guide the selection and implementation of redshift estimators in DESC pipelines. -RAIL differs from previous plans for PZ pipeline infrastructure in that it is broken into stages, -each corresponding to a manageable unit of infrastructure advancement, a specific question to answer with that code, and a guaranteed publication opportunity. 
-RAIL uses `qp `_ as a back-end for handling univariate probability density functions (PDFs) such as photo-z posteriors or :math:`n(z)` samples. +RAIL serves as the infrastructure supporting many extragalactic applications of `the Legacy Survey of Space and Time (LSST) `_ on `the Vera C. Rubin Observatory`_, including Rubin-wide commissioning activities. +RAIL was initiated by the Photometric Redshifts (PZ) Working Group (WG) of the `LSST Dark Energy Science Collaboration (DESC) `_ as a result of the lessons learned from the `Data Challenge 1 (DC1) experiment `_ to enable the PZ WG Deliverables in the `LSST-DESC Science Roadmap (see Sec. 5.18) `_, aiming to guide the selection and implementation of redshift estimators in DESC analysis pipelines. -The RAIL source code is publically available at https://github.com/LSSTDESC/RAIL. +RAIL is developed and maintained by a diverse team comprising DESC Pipeline Scientists (PSs), international in-kind contributors, LSST Interdisciplinary Collaboration for Computing (LINCC) Frameworks software engineers, and other volunteers, but all are welcome to join the team regardless of LSST data rights. +To get involved, chime in on the issues in any of the RAIL repositories described in the Overview section. .. toctree:: :maxdepth: 1 :caption: Getting Started - source/overview source/installation - source/contributing source/citing - -.. toctree:: - :maxdepth: 1 - :caption: RAIL Plans - - source/immediateplans - source/futureplans + source/overview + source/contributing .. toctree:: :maxdepth: 1 diff --git a/docs/source/overview.rst b/docs/source/overview.rst index 7c7d5e03..99d2c3d5 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -2,22 +2,21 @@ Overview ******** -RAIL (Redshift Assessment Infrastructure Layers) is LSST-DESC software for the computation and assessment of redshifts derived from Rubin data. 
-RAIL addresses the challenge of enabling stress-testing of multiple photo-z codes in the presence of realistically complex systematic imperfections in the input photometry and prior information (such as template libraries and training sets), everything from the handling of diverse output storage formats to the propagation of assumptions inherent to individual codes to the architecture of the machines on which the code is run. -RAIL seeks to minimize such impacts by unifying much of the infrastructure in a single modular code base that can be used by photo-z developers and consumers alike. -Beyond the comparison of different photo-z approaches, RAIL will be employed to generate photo-z catalogs used by DESC members in their science analyses. +RAIL enables stress-testing of multiple approaches to photo-z estimation at-scale for LSST in the presence of realistically complex systematic imperfections in the input photometry and prior information (such as template libraries and training sets) under science-agnostic and science-specific metrics, with the expectation that once a pipeline is validated through controlled experimentation, the exact same estimation procedure could be applied to real data without loss of validity. +To support such an ambitious goal, it has a highly modular structure encompassing three aspects of this kind of experiment and is built upon a number of key types of objects, however, the result is that RAIL is unavoidably complicated. +This overview seeks to present the organizational philosophy, basic structures, and core dependencies in order to motivate an exposition to the RAIL ecosystem. -There are three aspects to the RAIL approach to photo-zs: the creation of self-consistently forward-modeled, realistically complex mock data for testing purposes, the estimation of individual galaxy and galaxy sample redshift uncertainties, and the evaluation of the results of the estimators by generic and science-specific metrics. 
-RAIL includes a subpackage for each, providing a flexible framework for handling different approaches and at least one baseline implementation of a method under that umbrella as an example for the broader community to integrate their own codes into RAIL. -The purpose of each piece of infrastructure is outlined below. -For a working example illustrating all four components of RAIL, see the `examples/goldenspike/goldenspike.ipynb `_ jupyter notebook. +*This will be a lot easier to distill out of the paper draft than to write independently here.* +Organization +************ -A brief note on core DESC software dependencies -=============================================== +An end-to-end experiment entails the creation of self-consistently forward-modeled, realistically complex mock data for testing purposes, the estimation of individual galaxy and/or galaxy sample redshift uncertainties, and the evaluation of the resulting photo-z data products by informative metrics. +RAIL includes subpackages for each, providing a flexible framework for accessing implementations of approaches under each umbrella. +The purpose of each piece of infrastructure is outlined below. +For a working example illustrating all three components of RAIL, see the `examples/goldenspike/goldenspike.ipynb `_ Jupyter notebook. -The `qp` Ensemble format is the expected default storage format for redshift information within DESC, and all redshift PDFs, for both individual galaxies and galaxy samples (such as tomographic bin members or galaxy cluster members), will be stored as `qp` Ensemble objects to be directly accessible to LSST-DESC pipelines, such as `TXPipe `_. -The use of a unified `qp` Ensemble as the output format enables a consistent evaluation of redshift uncertainties. See `the qp repository `_ for more details, though in brief, `qp` enables transformation between different PDF parameterizations, computation of many useful metrics, and easy fileIO. 
+*needs significant revision/paring-down* `creation` ========== @@ -95,3 +94,27 @@ In the `example` directory, you can execute the evaluation/demo.ipynb jupyter no We aim to greatly expand the library of available metrics and welcome input from the community in doing so. An immediate extension would propagate estimated redshift posteriors to science-motivated metrics, and/or metrics related to computational requirements of the estimators. Within DESC, development of sophisticated metrics propagating photo-z uncertainties through cosmological probe analysis pipelines is now underway as part of Dark Energy Redshift Assessment Infrastructure Layers (DERAIL). + +Structure and core dependencies +******************************* + +"stages" +"pipelines" +"datastore" +"datahandle" +qp + +A brief note on core DESC software dependencies +=============================================== + +The `qp` Ensemble format is the expected default storage format for redshift information within DESC, and all redshift PDFs, for both individual galaxies and galaxy samples (such as tomographic bin members or galaxy cluster members), will be stored as `qp` Ensemble objects to be directly accessible to LSST-DESC pipelines, such as `TXPipe `_. +The use of a unified `qp` Ensemble as the output format enables a consistent evaluation of redshift uncertainties. See `the qp repository `_ for more details, though in brief, `qp` enables transformation between different PDF parameterizations, computation of many useful metrics, and easy fileIO. + +The RAIL ecosystem +****************** + +`pz-rail-ties` contains RAIL's base classes and dependency-light methods as creators, estimators, and evaluators, whereas the nontrivial engines and algorithms are developed in standalone repositories, in order to reduce the risk that development will be interrupted by temporary issues with any one wrapped method. 
+All other packages in the RAIL ecosystem automatically include `pz-rail-ties`, and we don't recommend installing it on its own because it doesn't contain the real methods, but it is automatically included with all other packages in the RAIL ecosystem. +Users who know which methods they want can choose to install their RAIL-wrapped packages one at a time, but `pz-rail` includes all the available methods in the RAIL ecosystem. + +`pz-rail-pipelines` is a community-driven repository of pipelines built with From 0e85edd34dea94e84945bef8071f92bc01c3dfb3 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 12:37:10 -0400 Subject: [PATCH 02/12] renaming for trainZ --- examples/goldenspike_examples/goldenspike.yml | 4 ++-- src/rail/estimation/algos/trainZ.py | 11 +++++++---- tests/estimation/test_algos.py | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/goldenspike_examples/goldenspike.yml b/examples/goldenspike_examples/goldenspike.yml index 42360c15..37857b0e 100644 --- a/examples/goldenspike_examples/goldenspike.yml +++ b/examples/goldenspike_examples/goldenspike.yml @@ -42,13 +42,13 @@ stages: - classname: QuantityCut name: quantity_cut nprocess: 1 -- classname: Inform_trainZ +- classname: InformTrainZ name: inform_trainZ nprocess: 1 - classname: Estimator name: estimate_bpz nprocess: 1 -- classname: TrainZ +- classname: EstimateTrainZ name: estimate_trainZ nprocess: 1 - classname: RandomPZ diff --git a/src/rail/estimation/algos/trainZ.py b/src/rail/estimation/algos/trainZ.py index a6fbe145..47b6bcef 100644 --- a/src/rail/estimation/algos/trainZ.py +++ b/src/rail/estimation/algos/trainZ.py @@ -23,11 +23,11 @@ def __init__(self, zgrid, pdf, zmode): self.zmode = zmode -class Inform_trainZ(CatInformer): +class InformTrainZ(CatInformer): """Train an Estimator which returns a global PDF for all galaxies """ - name = 'Inform_trainZ' + name = 'InformTrainZ' config_options = CatInformer.config_options.copy() 
config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, @@ -56,11 +56,11 @@ def run(self): self.add_data('model', self.model) -class TrainZ(CatEstimator): +class EstimateTrainZ(CatEstimator): """CatEstimator which returns a global PDF for all galaxies """ - name = 'TrainZ' + name = 'EstimateTrainZ' config_options = CatEstimator.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, @@ -87,3 +87,6 @@ def _process_chunk(self, start, end, data, first): data=dict(xvals=self.zgrid, yvals=np.tile(self.train_pdf, (test_size, 1)))) qp_d.set_ancil(dict(zmode=zmode)) self._do_chunk_output(qp_d, start, end, first) + + +# TODO: add summarizer for trainZ \ No newline at end of file diff --git a/tests/estimation/test_algos.py b/tests/estimation/test_algos.py index ff10afaf..87596d3e 100644 --- a/tests/estimation/test_algos.py +++ b/tests/estimation/test_algos.py @@ -124,8 +124,8 @@ def test_train_pz(): zb_expected = np.repeat(0.1445183, 10) pdf_expected = np.zeros(shape=(301,)) pdf_expected[10:16] = [7, 23, 8, 23, 26, 13] - train_algo = trainZ.Inform_trainZ - pz_algo = trainZ.TrainZ + train_algo = trainZ.InformTrainZ + pz_algo = trainZ.EstimateTrainZ results, rerun_results, rerun3_results = one_algo( "TrainZ", train_algo, pz_algo, train_config_dict, estim_config_dict ) From 79fd8b81193986c18edf9556dbbbb527bdc0cc49 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 12:38:27 -0400 Subject: [PATCH 03/12] todos for documentation needs --- src/rail/estimation/algos/README.md | 12 ++- src/rail/estimation/estimator.py | 113 ++++++++++++-------------- src/rail/estimation/informer.py | 1 + src/rail/estimation/summarizer.py | 120 +++++++++++++--------------- 4 files changed, 120 insertions(+), 126 deletions(-) create mode 100644 src/rail/estimation/informer.py diff --git a/src/rail/estimation/algos/README.md b/src/rail/estimation/algos/README.md index 8472fa80..4eb7fbde 100644 --- a/src/rail/estimation/algos/README.md +++ 
b/src/rail/estimation/algos/README.md @@ -14,6 +14,8 @@ The usual procedure for a gridded parameterization is to define the redshift eva - `load_model`: boolean, if True codes should skip inform and load a pretrained model from the filename specified in `modelfile`. - `save_train`: boolean, if True codes should save the model as computed during the running of `inform` to the filename in `modelfile`. +TODO: move these detailed explanations to docstring of each algo so they show up in the API documentation! + # BPZ_Lite @@ -130,7 +132,9 @@ randomPZ is not a real photo-z code, it is a placeholder demo code used to demon - `rand_zmax`: maximum redshift for grid # simpleNN -Another "demo" photo-z algorithm, this subclass uses sklearn's neural_network to create a simple point estimate for redshift, and outputs a Gaussian redshift estimate based soley on an ad-hoc `width` parameter specified by the user. It is *not* a fully functional code, and should again be though of more for demonstration. In the future we will implement a more sophisticated NN-based photo-z and likely remove this demo. +Another "demo" photo-z algorithm, this subclass uses sklearn's neural_network to create a simple point estimate for redshift, and outputs a Gaussian redshift estimate based solely on an ad-hoc `width` parameter specified by the user. +It is *not* a fully functional code, and should again be thought of more for demonstration. +In the future we will implement a more sophisticated NN-based photo-z and likely remove this demo. - `width`: width of the PDFs, where the output Gaussian will be assigned a width of width*(1+zpoint). @@ -139,4 +143,8 @@ Another "demo" photo-z algorithm, this subclass uses sklearn's neural_network to - `bands`: string of the single-letter filter names, e.g. "ugrizy", again used by TXPipe. 
# trainZ -trainZ is our "pathological" photo-z estimator, it calculates the N(z) histogram of the training data, normalizes this, and outputs this as a redshift estimate for each galaxy in the test sample. No parameters beyond the zmin, zmax, nzbins, and inform options are necessary to run the code. As every PDF will be identical, running this estimator for a large number of objects can be a waste of space, and you might want to consider just storing the normalized N(z) separately. +trainZ is our "pathological" photo-z estimator, so-called because it returns a normalized histogram of the training set as the estimated photo-z PDF of every galaxy in the test set, i.e. it does not account for the test set data whatsoever. +No parameters beyond the zmin, zmax, nzbins, and inform options are necessary to run the code. +As every PDF will be identical, running this estimator for a large number of objects can be a waste of space, and you might want to consider just storing the normalized N(z) separately. + +TODO: then that should be an option? diff --git a/src/rail/estimation/estimator.py b/src/rail/estimation/estimator.py index 360b8c44..f5974b92 100644 --- a/src/rail/estimation/estimator.py +++ b/src/rail/estimation/estimator.py @@ -1,11 +1,61 @@ """ -Abstract base classes defining redshift estimations Informers and Estimators +Abstract base classes for per-galaxy photo-z PDF estimation """ from rail.core.data import TableHandle, QPHandle, ModelHandle from rail.core.stage import RailStage import gc +class CatInformer(RailStage): + """The base class for informing models used to make photo-z posterior estimates + from catalog-like inputs (i.e., tables with fluxes in photometric bands among + the set of columns). + + Estimators take as input a generic "model", the details of which depend on the sub-class. 
+ All Estimators must have an associated Informer that produces the models given inputs such as training sets or SED template libraries with priors, hence the generic name; while "Trainer" would be accurate for data-driven estimators, "Informer" also encompasses model-fitting methods ingesting prior information. + """ + + name = 'CatInformer' + config_options = RailStage.config_options.copy() + config_options.update(hdf5_groupname=str, save_train=True) + inputs = [('input', TableHandle)] + outputs = [('model', ModelHandle)] + + def __init__(self, args, comm=None): + """Initialize Informer that can inform models for redshift estimation """ + RailStage.__init__(self, args, comm=comm) + self.model = None + + def inform(self, training_data): + """The main method for Informers + + This will attach the input_data to this `Informer` + (for introspection and provenance tracking). + + Then it will call the run() and finalize() methods, which need to + be implemented by the sub-classes. + + The run() method will need to register the model that it creates to this Estimator + by using `self.add_data('model', model)`. + + Finally, this will return a ModelHandle providing access to the trained model. 
+ + Parameters + ---------- + input_data : `dict` or `TableHandle` + dictionary of all input data, or a `TableHandle` providing access to it + + Returns + ------- + model : ModelHandle + Handle providing access to trained model + """ + self.set_data('input', training_data) + self.run() + self.finalize() + return self.get_handle('model') + + class CatEstimator(RailStage): """The base class for making photo-z posterior estimates from catalog-like inputs (i.e., tables with fluxes in photometric bands among the set of columns) @@ -119,63 +169,4 @@ def _do_chunk_output(self, qp_dstn, start, end, first): self._output_handle = self.add_handle('output', data = qp_dstn) self._output_handle.initialize_write(self._input_length, communicator = self.comm) self._output_handle.set_data(qp_dstn, partial=True) - self._output_handle.write_chunk(start, end) - - - -class CatInformer(RailStage): - """The base class for informing models used to make photo-z posterior estimates - from catalog-like inputs (i.e., tables with fluxes in photometric bands among - the set of columns). - - Estimators use a generic "model", the details of which depends on the sub-class. - Most estimators will have associated Informer classes, which can be used to inform - those models. - - (Note, "Inform" is more generic than "Train" as it also applies to algorithms that - are template-based rather than machine learning-based.) - - Informer will produce as output a generic "model", the details of which depends on the sub-class. - - They take as "input" catalog-like tabular data, which is used to "inform" the model. 
- """ - - name = 'Informer' - config_options = RailStage.config_options.copy() - config_options.update(hdf5_groupname=str, save_train=True) - inputs = [('input', TableHandle)] - outputs = [('model', ModelHandle)] - - def __init__(self, args, comm=None): - """Initialize Informer that can inform models for redshift estimation """ - RailStage.__init__(self, args, comm=comm) - self.model = None - - def inform(self, training_data): - """The main interface method for Informers - - This will attach the input_data to this `Informer` - (for introspection and provenance tracking). - - Then it will call the run() and finalize() methods, which need to - be implemented by the sub-classes. - - The run() method will need to register the model that it creates to this Estimator - by using `self.add_data('model', model)`. - - Finally, this will return a ModelHandle providing access to the trained model. - - Parameters - ---------- - input_data : `dict` or `TableHandle` - dictionary of all input data, or a `TableHandle` providing access to it - - Returns - ------- - model : ModelHandle - Handle providing access to trained model - """ - self.set_data('input', training_data) - self.run() - self.finalize() - return self.get_handle('model') + self._output_handle.write_chunk(start, end) \ No newline at end of file diff --git a/src/rail/estimation/informer.py b/src/rail/estimation/informer.py new file mode 100644 index 00000000..4c018944 --- /dev/null +++ b/src/rail/estimation/informer.py @@ -0,0 +1 @@ +# TODO: put informers here since they only care about input types and can be used for estimators, summarizers, and tomographers \ No newline at end of file diff --git a/src/rail/estimation/summarizer.py b/src/rail/estimation/summarizer.py index 39c1fd68..dc025093 100644 --- a/src/rail/estimation/summarizer.py +++ b/src/rail/estimation/summarizer.py @@ -1,18 +1,68 @@ """ -Abstract base classes defining redshift estimations Informers and Estimators +Abstract base classes for characterizing 
redshift distributions for an ensemble of galaxies. """ from rail.core.data import QPHandle, TableHandle, ModelHandle from rail.core.stage import RailStage +class PzInformer(RailStage): #pragma: no cover + """The base class for informing models used to summarize photo-z posterior estimates + from ensembles of p(z) distributions. + + Summarizers take as input a generic "model", the details of which depend on the sub-class. + All Summarizes must have an associated Informer that produces the models given inputs such as training sets or SED template libraries with priors, hence the generic name; while "Trainer" would be accurate for data-driven estimators, "Informer" also encompases model-fitting methods ingesting prior information. + + They take as "input" a qp.Ensemble of per-galaxy p(z) data, which is used to "inform" the model. + """ + + name = 'PzInformer' + config_options = RailStage.config_options.copy() + inputs = [('input', QPHandle)] + outputs = [('model', ModelHandle)] + + def __init__(self, args, comm=None): + """Initialize Informer that can inform models for redshift estimation """ + RailStage.__init__(self, args, comm=comm) + self.model = None + + def inform(self, training_data): + """The main interface method for Informers + + This will attach the input_data to this `Informer` + (for introspection and provenance tracking). + + Then it will call the run() and finalize() methods, which need to + be implemented by the sub-classes. + + The run() method will need to register the model that it creates to this Estimator + by using `self.add_data('model', model)`. + + Finally, this will return a ModelHandle providing access to the trained model. 
+ + Parameters + ---------- + input_data : `qp.Ensemble` + Per-galaxy p(z), and any ancilary data associated with it + + Returns + ------- + model : ModelHandle + Handle providing access to trained model + """ + self.set_data('input', training_data) + self.run() + self.finalize() + return self.get_handle('model') + + class CatSummarizer(RailStage): #pragma: no cover - """The base class for classes that go from catalog-like tables - to ensemble NZ estimates. + """The base class to go from catalog-like tables + to ensemble N(z) estimates. - CatSummarizer take as "input" a catalog-like table. I.e., a + CatSummarizer takes as "input" a catalog-like table, i.e., a table with fluxes in photometric bands among the set of columns. - provide as "output" a QPEnsemble, with per-ensemble n(z). + It provides as "output" a QPEnsemble, with a single n(z) or samples thereof. """ name = 'CatSummarizer' @@ -26,8 +76,8 @@ def __init__(self, args, comm=None): RailStage.__init__(self, args, comm=comm) def summarize(self, input_data): - """The main run method for the summarization, should be implemented - in the specific subclass. + """The main method for the summarization process implemented + in the run method of each specific subclass. This will attach the input_data to this `CatSummarizer` (for introspection and provenance tracking). @@ -185,59 +235,3 @@ def summarize(self, input_data, spec_data): self.run() self.finalize() return self.get_handle('output') - - -class PzInformer(RailStage): #pragma: no cover - """The base class for informing models used to summarize photo-z posterior estimates - from ensembles of p(z) distributions. - - PzSummarizers can use a generic "model", the details of which depends on the sub-class. - Some summaer will have associated PzInformer classes, which can be used to inform - those models. - - (Note, "Inform" is more generic than "Train" as it also applies to algorithms that - are template-based rather than machine learning-based.) 
- - PzInformer will produce as output a generic "model", the details of which depends on the sub-class. - - They take as "input" a qp.Ensemble of per-galaxy p(z) data, which is used to "inform" the model. - """ - - name = 'Informer' - config_options = RailStage.config_options.copy() - inputs = [('input', QPHandle)] - outputs = [('model', ModelHandle)] - - def __init__(self, args, comm=None): - """Initialize Informer that can inform models for redshift estimation """ - RailStage.__init__(self, args, comm=comm) - self.model = None - - def inform(self, training_data): - """The main interface method for Informers - - This will attach the input_data to this `Informer` - (for introspection and provenance tracking). - - Then it will call the run() and finalize() methods, which need to - be implemented by the sub-classes. - - The run() method will need to register the model that it creates to this Estimator - by using `self.add_data('model', model)`. - - Finally, this will return a ModelHandle providing access to the trained model. 
- - Parameters - ---------- - input_data : `qp.Ensemble` - Per-galaxy p(z), and any ancilary data associated with it - - Returns - ------- - model : ModelHandle - Handle providing access to trained model - """ - self.set_data('input', training_data) - self.run() - self.finalize() - return self.get_handle('model') From 6a0a31a05ee1866683417dab204404d18bc570a9 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 14:10:35 -0400 Subject: [PATCH 04/12] consistency for simplesom --- examples/estimation_examples/SimpleSOM_demo.ipynb | 12 ++++++------ src/rail/estimation/algos/simpleSOM.py | 8 ++++---- tests/estimation/test_som_summarizers.py | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/estimation_examples/SimpleSOM_demo.ipynb b/examples/estimation_examples/SimpleSOM_demo.ipynb index a86be6a8..6ff477f2 100644 --- a/examples/estimation_examples/SimpleSOM_demo.ipynb +++ b/examples/estimation_examples/SimpleSOM_demo.ipynb @@ -120,7 +120,7 @@ "metadata": {}, "outputs": [], "source": [ - "from rail.estimation.algos.simpleSOM import Inform_SimpleSOMSummarizer" + "from rail.estimation.algos.simpleSOM import InformSimpleSOM" ] }, { @@ -160,7 +160,7 @@ "metadata": {}, "outputs": [], "source": [ - "inform_som = Inform_SimpleSOMSummarizer.make_stage(name='inform_som', **inform_dict)" + "inform_som = InformSimpleSOM.make_stage(name='inform_som', **inform_dict)" ] }, { @@ -433,8 +433,8 @@ "metadata": {}, "outputs": [], "source": [ - "from rail.estimation.algos.simpleSOM import SimpleSOMSummarizer\n", - "som_summarizer = SimpleSOMSummarizer.make_stage(name='SOM_summarizer', **summ_dict)" + "from rail.estimation.algos.simpleSOM import SimpleSOM\n", + "som_summarizer = SummarizeSimpleSOM.make_stage(name='SOM_summarizer', **summ_dict)" ] }, { @@ -502,7 +502,7 @@ " uncovered_cell_file=\"BRIGHT_uncovered_cells.hdf5\",\n", " objid_name='id',\n", " cellid_output='BRIGHT_output_cellIDs.hdf5')\n", - "bright_summarizer = 
SimpleSOMSummarizer.make_stage(name='bright_summarizer', **bright_dict)" + "bright_summarizer = SummarizeSimpleSOM.make_stage(name='bright_summarizer', **bright_dict)" ] }, { @@ -711,7 +711,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.10.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/src/rail/estimation/algos/simpleSOM.py b/src/rail/estimation/algos/simpleSOM.py index 5b620fd8..ee642d49 100644 --- a/src/rail/estimation/algos/simpleSOM.py +++ b/src/rail/estimation/algos/simpleSOM.py @@ -31,7 +31,7 @@ def _computemagcolordata(data, ref_column_name, column_names, colusage): return coldata.T -class Inform_SimpleSOMSummarizer(CatInformer): +class InformSimpleSOM(CatInformer): """Summarizer that uses a SOM to construct a weighted sum of spec-z objects in the same SOM cell as each photometric galaxy in order to estimate the overall N(z). This is @@ -73,7 +73,7 @@ class Inform_SimpleSOMSummarizer(CatInformer): pickle file containing the `minisom` SOM object that will be used by the estimation/summarization stage """ - name = 'Inform_SimpleSOM' + name = 'InformSimpleSOM' config_options = CatInformer.config_options.copy() config_options.update(nondetect_val=SHARED_PARAMS, mag_limits=SHARED_PARAMS, @@ -130,7 +130,7 @@ def run(self): self.add_data('model', self.model) -class SimpleSOMSummarizer(SZPZSummarizer): +class SummarizeSimpleSOM(SZPZSummarizer): """Quick implementation of a SOM-based summarizer that constructs and N(z) estimate via a weighted sum of the empirical N(z) consisting of the normalized histogram @@ -192,7 +192,7 @@ class SimpleSOMSummarizer(SZPZSummarizer): qp_ens: qp Ensemble ensemble of bootstrap realizations of the estimated N(z) for the input photometric data """ - name = 'SimpleSOMSummarizer' + name = 'SummarizeSimpleSOM' config_options = SZPZSummarizer.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, diff 
--git a/tests/estimation/test_som_summarizers.py b/tests/estimation/test_som_summarizers.py index 4301f2ee..3f717063 100644 --- a/tests/estimation/test_som_summarizers.py +++ b/tests/estimation/test_som_summarizers.py @@ -61,8 +61,8 @@ def one_algo(key, inform_class, summarizer_class, summary_kwargs): def test_SimpleSOM(): summary_config_dict = {"m_dim": 21, "n_dim": 21, "column_usage": "colors"} - inform_class = simpleSOM.Inform_SimpleSOMSummarizer - summarizerclass = simpleSOM.SimpleSOMSummarizer + inform_class = simpleSOM.InformSimpleSOM + summarizerclass = simpleSOM.SummarizeSimpleSOM _ = one_algo("SimpleSOM", inform_class, summarizerclass, summary_config_dict) @@ -73,6 +73,6 @@ def test_SimpeSOM_with_mag_and_colors(): "column_usage": "magandcolors", "objid_name": "id", } - inform_class = simpleSOM.Inform_SimpleSOMSummarizer - summarizerclass = simpleSOM.SimpleSOMSummarizer + inform_class = simpleSOM.InformSimpleSOM + summarizerclass = simpleSOM.SummarizeSimpleSOM _ = one_algo("SimpleSOM_wmag", inform_class, summarizerclass, summary_config_dict) From 246a383aad4ac61731d059b51d307b08dee1485c Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 15:07:44 -0400 Subject: [PATCH 05/12] propagating naming through SOM summarizers --- examples/estimation_examples/README.md | 2 +- src/rail/core/algo_utils.py | 23 +++++++- src/rail/estimation/README.md | 22 ++------ src/rail/estimation/algos/simpleSOM.py | 50 +++++++++--------- src/rail/estimation/algos/somocluSOM.py | 55 +++++++++++--------- tests/estimation/test_som_summarizers.py | 8 +-- tests/estimation/test_somoclu_summarizers.py | 8 +-- 7 files changed, 91 insertions(+), 77 deletions(-) diff --git a/examples/estimation_examples/README.md b/examples/estimation_examples/README.md index b4e98af7..58a45282 100644 --- a/examples/estimation_examples/README.md +++ b/examples/estimation_examples/README.md @@ -4,6 +4,6 @@ This directory contains example notebooks explaining how to use the RAIL Estimat - 
**RAIL_estimation_demo.ipynb** explains to inform a model used for photo-z estimation, and then how to use that model to estimate p(z). -- **SimpleSOM_demo.ipynb**, **somocluSOM_demo.ipynb**, and **somocluSOMcluster_demo.ipynb** demonstrate of the use of the `SimpleSOMSummarizer` summarization module. +- **SimpleSOM_demo.ipynb**, **somocluSOM_demo.ipynb**, and **somocluSOMcluster_demo.ipynb** demonstrate of the use of the `SummarizeSimpleSOM` and `SummarizeSOMoclu` `SZPZSummarizer` stages. - [test_sampled_summarizers.ipynb](https://lsstdescrail.readthedocs.io/en/latest/source/estimation-notebooks.html#testing-sampled-summarizers) outlines quick and dirty bootstrap versions of the `NaiveStack`, `PointEstimateHist`, and `VarInference` sumarizers, as well as `NZDir`. diff --git a/src/rail/core/algo_utils.py b/src/rail/core/algo_utils.py index 85d6d573..ae505449 100644 --- a/src/rail/core/algo_utils.py +++ b/src/rail/core/algo_utils.py @@ -1,4 +1,4 @@ -"""Utility functions to test alogrithms""" +"""Utility functions for estimation alogrithms""" import os from rail.core.stage import RailStage from rail.core.utils import RAILDIR @@ -63,3 +63,24 @@ def one_algo(key, single_trainer, single_estimator, train_kwargs, estim_kwargs): except FileNotFoundError: #pragma: no cover pass return estim.data, estim_2.data, estim_3.data + +def _computemagcolordata(data, ref_column_name, column_names, colusage): + # TODO: needs a docstring + if colusage not in ['colors', 'magandcolors', 'columns']: # pragma: no cover + raise ValueError(f"column usage value {colusage} is not valid, valid values are 'colors', 'magandcolors', and 'columns'") + numcols = len(column_names) + if colusage == 'magandcolors': + coldata = np.array(data[ref_column_name]) + for i in range(numcols - 1): + tmpcolor = data[column_names[i]] - data[column_names[i + 1]] + coldata = np.vstack((coldata, tmpcolor)) + if colusage == 'colors': + coldata = np.array(data[column_names[0]] - data[column_names[1]]) + for i in 
range(numcols - 2): + tmpcolor = data[column_names[i + 1]] - data[column_names[i + 2]] + coldata = np.vstack((coldata, tmpcolor)) + if colusage == 'columns': # pragma: no cover + coldata = np.array(data[column_names[0]]) + for i in range(numcols - 1): + coldata = np.vstack((coldata, np.array(data[column_names[i + 1]]))) + return coldata.T \ No newline at end of file diff --git a/src/rail/estimation/README.md b/src/rail/estimation/README.md index d0b97057..9bc2cdae 100644 --- a/src/rail/estimation/README.md +++ b/src/rail/estimation/README.md @@ -1,13 +1,7 @@ # RAIL estimation modules -This code enables the automatic execution of arbitrary redshift estimation codes in a common computing environment. - -## Motivation - -For the sake of this challenge, we will run scripts that accept test set photometry, run a particular pre-trained photo-z estimation code, and produce estimated photo-z posteriors. -Where possible, we wil use formats compatible with other LSST-DESC pipelines, including [TXPipe](https://github.com/LSSTDESC/TXPipe/). -Code here will provide a script template for wrapping a machine learning code that we will run automatically on a variety of test sets blinded from those who submit scripts. -We will have to make a decision about the acceptable output format(s) of redshift posteriors. +This code enables the execution of arbitrary redshift characterization codes with a shared API. +It includes classes for numerous algorithms for characterizing per-galaxy photo-z PDFs and ensemble galaxy sample redshift distributions. ## Structure @@ -19,15 +13,9 @@ Each must correspond to a config file in with any parameters the method needs, e In the `example` directory, run the following `python main.py configs/randomPZ.yaml` -## Immediate next steps - -`base.yaml` should not be hardcoded anywhere and should instead appear only in `main.py`. -`utils.py` is a placeholder and should be eliminated, and i/o functions should be migrated elsewhere. 
-There should be more examples of categories of nested config parameters in the `.yaml` files. -The `rail.estimation` module needs documentation and tests ASAP. - ## Future extensions +To sppplement the classes of stages defined in `estimation.py` and `summaization.py`, a `classification.py` module may be added to include algorithms for defining subsamples of a galaxy sample from their photometric data and/or photo-z data products, e.g. tomographic binning procedures. + It may not be possible to isolate some complex `degradation` effects in a shared training set, so future versions will require an additional script for each machine-learning-based code that executes a training step. -The estimation scripts for codes that do not naively apply machine learning to photometry, instead requiring observed information beyond photometry, will need to accept different forms of data, so we must design the estimation framework to be flexible about input formats. -Similarly, the framework must be flexible enough to provide an SED template library and priors to template-based redshift estimation codes. +The estimation scripts for codes that do not naively apply machine learning to photometry, instead requiring observed information beyond photometry such as positions or imaging, will need to accept different forms of data, so we must design the estimation framework to be flexible about input formats. 
diff --git a/src/rail/estimation/algos/simpleSOM.py b/src/rail/estimation/algos/simpleSOM.py index ee642d49..f4c4dec4 100644 --- a/src/rail/estimation/algos/simpleSOM.py +++ b/src/rail/estimation/algos/simpleSOM.py @@ -5,33 +5,33 @@ from rail.estimation.summarizer import SZPZSummarizer from rail.core.data import QPHandle, TableHandle from rail.core.common_params import SHARED_PARAMS +from rail.core.algo_utils import _computemagcolordata import qp +# def _computemagcolordata(data, ref_column_name, column_names, colusage): +# if colusage not in ['colors', 'magandcolors', 'columns']: # pragma: no cover +# raise ValueError(f"column usage value {colusage} is not valid, valid values are 'colors', 'magandcolors', and 'columns'") +# numcols = len(column_names) +# if colusage == 'magandcolors': +# coldata = np.array(data[ref_column_name]) +# for i in range(numcols - 1): +# tmpcolor = data[column_names[i]] - data[column_names[i + 1]] +# coldata = np.vstack((coldata, tmpcolor)) +# if colusage == 'colors': +# coldata = np.array(data[column_names[0]] - data[column_names[1]]) +# for i in range(numcols - 2): +# tmpcolor = data[column_names[i + 1]] - data[column_names[i + 2]] +# coldata = np.vstack((coldata, tmpcolor)) +# if colusage == 'columns': # pragma: no cover +# coldata = np.array(data[column_names[0]]) +# for i in range(numcols - 1): +# coldata = np.vstack((coldata, np.array(data[column_names[i + 1]]))) +# return coldata.T -def _computemagcolordata(data, ref_column_name, column_names, colusage): - if colusage not in ['colors', 'magandcolors', 'columns']: # pragma: no cover - raise ValueError(f"column usage value {colusage} is not valid, valid values are 'colors', 'magandcolors', and 'columns'") - numcols = len(column_names) - if colusage == 'magandcolors': - coldata = np.array(data[ref_column_name]) - for i in range(numcols - 1): - tmpcolor = data[column_names[i]] - data[column_names[i + 1]] - coldata = np.vstack((coldata, tmpcolor)) - if colusage == 'colors': - coldata = 
np.array(data[column_names[0]] - data[column_names[1]]) - for i in range(numcols - 2): - tmpcolor = data[column_names[i + 1]] - data[column_names[i + 2]] - coldata = np.vstack((coldata, tmpcolor)) - if colusage == 'columns': # pragma: no cover - coldata = np.array(data[column_names[0]]) - for i in range(numcols - 1): - coldata = np.vstack((coldata, np.array(data[column_names[i + 1]]))) - return coldata.T - -class InformSimpleSOM(CatInformer): +class SimpleSOMInformer(CatInformer): """Summarizer that uses a SOM to construct a weighted sum of spec-z objects in the same SOM cell as each photometric galaxy in order to estimate the overall N(z). This is @@ -73,7 +73,7 @@ class InformSimpleSOM(CatInformer): pickle file containing the `minisom` SOM object that will be used by the estimation/summarization stage """ - name = 'InformSimpleSOM' + name = 'SimpleSOMInformer' config_options = CatInformer.config_options.copy() config_options.update(nondetect_val=SHARED_PARAMS, mag_limits=SHARED_PARAMS, @@ -130,7 +130,7 @@ def run(self): self.add_data('model', self.model) -class SummarizeSimpleSOM(SZPZSummarizer): +class SimpleSOMSummarizer(SZPZSummarizer): """Quick implementation of a SOM-based summarizer that constructs and N(z) estimate via a weighted sum of the empirical N(z) consisting of the normalized histogram @@ -192,7 +192,7 @@ class SummarizeSimpleSOM(SZPZSummarizer): qp_ens: qp Ensemble ensemble of bootstrap realizations of the estimated N(z) for the input photometric data """ - name = 'SummarizeSimpleSOM' + name = 'SimpleSOMSummarizer' config_options = SZPZSummarizer.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, @@ -328,3 +328,5 @@ def run(self): self.add_data('single_NZ', qp_d) self.add_data('uncovered_cell_file', bad_pix) self.add_data('cellid_output', id_dict) + +# TODO: add EstimateSimpleSOM \ No newline at end of file diff --git a/src/rail/estimation/algos/somocluSOM.py b/src/rail/estimation/algos/somocluSOM.py index 
261d3466..8ac2b787 100644 --- a/src/rail/estimation/algos/somocluSOM.py +++ b/src/rail/estimation/algos/somocluSOM.py @@ -13,28 +13,30 @@ import sklearn.cluster as sc from scipy.spatial.distance import cdist from rail.core.common_params import SHARED_PARAMS - - - -def _computemagcolordata(data, ref_column_name, column_names, colusage): - if colusage not in ['colors', 'magandcolors', 'columns']: # pragma: no cover - raise ValueError(f"column usage value {colusage} is not valid, valid values are 'colors', 'magandcolors', and 'columns'") - numcols = len(column_names) - if colusage == 'magandcolors': - coldata = np.array(data[ref_column_name]) - for i in range(numcols - 1): - tmpcolor = data[column_names[i]] - data[column_names[i + 1]] - coldata = np.vstack((coldata, tmpcolor)) - if colusage == 'colors': - coldata = np.array(data[column_names[0]] - data[column_names[1]]) - for i in range(numcols - 2): - tmpcolor = data[column_names[i + 1]] - data[column_names[i + 2]] - coldata = np.vstack((coldata, tmpcolor)) - if colusage == 'columns': # pragma: no cover - coldata = np.array(data[column_names[0]]) - for i in range(numcols - 1): - coldata = np.vstack((coldata, np.array(data[column_names[i + 1]]))) - return coldata.T +from rail.core.algo_utils import _computemagcolordata + +# TODO: review whether these pragmas are justified + + +# def _computemagcolordata(data, ref_column_name, column_names, colusage): +# if colusage not in ['colors', 'magandcolors', 'columns']: # pragma: no cover +# raise ValueError(f"column usage value {colusage} is not valid, valid values are 'colors', 'magandcolors', and 'columns'") +# numcols = len(column_names) +# if colusage == 'magandcolors': +# coldata = np.array(data[ref_column_name]) +# for i in range(numcols - 1): +# tmpcolor = data[column_names[i]] - data[column_names[i + 1]] +# coldata = np.vstack((coldata, tmpcolor)) +# if colusage == 'colors': +# coldata = np.array(data[column_names[0]] - data[column_names[1]]) +# for i in range(numcols 
- 2): +# tmpcolor = data[column_names[i + 1]] - data[column_names[i + 2]] +# coldata = np.vstack((coldata, tmpcolor)) +# if colusage == 'columns': # pragma: no cover +# coldata = np.array(data[column_names[0]]) +# for i in range(numcols - 1): +# coldata = np.vstack((coldata, np.array(data[column_names[i + 1]]))) +# return coldata.T def get_bmus(som, data=None, split=200): # pragma: no cover @@ -123,7 +125,7 @@ def plot_som(ax, som_map, grid_type='rectangular', colormap=cm.viridis, cbar_nam ax.axis('off') -class Inform_somocluSOMSummarizer(CatInformer): +class SOMocluInformer(CatInformer): """Summarizer that uses a SOM to construct a weighted sum of spec-z objects in the same SOM cell as each photometric galaxy in order to estimate the overall N(z). This is @@ -169,7 +171,7 @@ class Inform_somocluSOMSummarizer(CatInformer): pickle file containing the `somoclu` SOM object that will be used by the estimation/summarization stage """ - name = 'Inform_SOMoclu' + name = 'SOMocluInformer' config_options = CatInformer.config_options.copy() config_options.update(nondetect_val=SHARED_PARAMS, mag_limits=SHARED_PARAMS, @@ -233,7 +235,7 @@ def run(self): self.add_data('model', self.model) -class somocluSOMSummarizer(SZPZSummarizer): +class SOMocluSummarizer(SZPZSummarizer): """Quick implementation of a SOM-based summarizer. It will group a pre-trained SOM into hierarchical clusters and assign a galaxy sample into SOM cells and clusters. 
Then it @@ -298,7 +300,7 @@ class somocluSOMSummarizer(SZPZSummarizer): qp_ens: qp Ensemble ensemble of bootstrap realizations of the estimated N(z) for the input photometric data """ - name = 'somocluSOMSummarizer' + name = 'SOMocluSummarizer' config_options = SZPZSummarizer.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, @@ -544,3 +546,4 @@ def _do_chunk_output(self, id_dict, start, end, first): self._cellid_handle.set_data(id_dict, partial=True) self._cellid_handle.write_chunk(start, end) +# TODO: add EstimateSOMoclu \ No newline at end of file diff --git a/tests/estimation/test_som_summarizers.py b/tests/estimation/test_som_summarizers.py index 3f717063..a11c897c 100644 --- a/tests/estimation/test_som_summarizers.py +++ b/tests/estimation/test_som_summarizers.py @@ -61,8 +61,8 @@ def one_algo(key, inform_class, summarizer_class, summary_kwargs): def test_SimpleSOM(): summary_config_dict = {"m_dim": 21, "n_dim": 21, "column_usage": "colors"} - inform_class = simpleSOM.InformSimpleSOM - summarizerclass = simpleSOM.SummarizeSimpleSOM + inform_class = simpleSOM.SimpleSOMInformer + summarizerclass = simpleSOM.SimpleSOMSummarizer _ = one_algo("SimpleSOM", inform_class, summarizerclass, summary_config_dict) @@ -73,6 +73,6 @@ def test_SimpeSOM_with_mag_and_colors(): "column_usage": "magandcolors", "objid_name": "id", } - inform_class = simpleSOM.InformSimpleSOM - summarizerclass = simpleSOM.SummarizeSimpleSOM + inform_class = simpleSOM.SimpleSOMInformer + summarizerclass = simpleSOM.SimpleSOMSummarizer _ = one_algo("SimpleSOM_wmag", inform_class, summarizerclass, summary_config_dict) diff --git a/tests/estimation/test_somoclu_summarizers.py b/tests/estimation/test_somoclu_summarizers.py index db811c73..e351f968 100644 --- a/tests/estimation/test_somoclu_summarizers.py +++ b/tests/estimation/test_somoclu_summarizers.py @@ -61,8 +61,8 @@ def one_algo(key, inform_class, summarizer_class, summary_kwargs): def test_SomocluSOM(): 
summary_config_dict = {"n_rows": 21, "n_columns": 21, "column_usage": "colors"} - inform_class = somocluSOM.Inform_somocluSOMSummarizer - summarizerclass = somocluSOM.somocluSOMSummarizer + inform_class = somocluSOM.SOMocluInformer + summarizerclass = somocluSOM.SOMocluSummarizer _ = one_algo("SOMomoclu", inform_class, summarizerclass, summary_config_dict) @@ -73,6 +73,6 @@ def test_SomocluSOM_with_mag_and_colors(): "column_usage": "magandcolors", "objid_name": "id", } - inform_class = somocluSOM.Inform_somocluSOMSummarizer - summarizerclass = somocluSOM.somocluSOMSummarizer + inform_class = somocluSOM.SOMocluInformer + summarizerclass = somocluSOM.SOMocluSummarizer _ = one_algo("SOMoclu_wmag", inform_class, summarizerclass, summary_config_dict) From 88603a69d7780d3b9cb8a939e492f3ebf6e93e0d Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 15:20:23 -0400 Subject: [PATCH 06/12] fixing dummy estimators --- examples/goldenspike_examples/goldenspike.yml | 4 ++-- src/rail/estimation/algos/trainZ.py | 8 ++++---- src/rail/estimation/estimator.py | 4 ++++ src/rail/estimation/summarizer.py | 8 ++++++++ tests/estimation/test_algos.py | 4 ++-- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/examples/goldenspike_examples/goldenspike.yml b/examples/goldenspike_examples/goldenspike.yml index 37857b0e..45a58ddf 100644 --- a/examples/goldenspike_examples/goldenspike.yml +++ b/examples/goldenspike_examples/goldenspike.yml @@ -42,13 +42,13 @@ stages: - classname: QuantityCut name: quantity_cut nprocess: 1 -- classname: InformTrainZ +- classname: TrainZInformer name: inform_trainZ nprocess: 1 - classname: Estimator name: estimate_bpz nprocess: 1 -- classname: EstimateTrainZ +- classname: TrainZEstimator name: estimate_trainZ nprocess: 1 - classname: RandomPZ diff --git a/src/rail/estimation/algos/trainZ.py b/src/rail/estimation/algos/trainZ.py index 47b6bcef..6f389d45 100644 --- a/src/rail/estimation/algos/trainZ.py +++ 
b/src/rail/estimation/algos/trainZ.py @@ -23,11 +23,11 @@ def __init__(self, zgrid, pdf, zmode): self.zmode = zmode -class InformTrainZ(CatInformer): +class TrainZInformer(CatInformer): """Train an Estimator which returns a global PDF for all galaxies """ - name = 'InformTrainZ' + name = 'TrainZInformer' config_options = CatInformer.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, @@ -56,11 +56,11 @@ def run(self): self.add_data('model', self.model) -class EstimateTrainZ(CatEstimator): +class TrainZEstimator(CatEstimator): """CatEstimator which returns a global PDF for all galaxies """ - name = 'EstimateTrainZ' + name = 'TrainZEstimator' config_options = CatEstimator.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, diff --git a/src/rail/estimation/estimator.py b/src/rail/estimation/estimator.py index f5974b92..3b0c42f2 100644 --- a/src/rail/estimation/estimator.py +++ b/src/rail/estimation/estimator.py @@ -40,6 +40,8 @@ def inform(self, training_data): Finally, this will return a ModelHandle providing access to the trained model. + Your subclasses should be named `[AlgoName]Informer` in accordance with the standards. + Parameters ---------- input_data : `dict` or `TableHandle` @@ -65,6 +67,8 @@ class CatEstimator(RailStage): They take as "input" tabular data, apply the photo-z estimation and provide as "output" a QPEnsemble, with per-object p(z). + Your subclasses should be named `[AlgoName]Estimator` in accordance with the standards. 
+ """ name = 'CatEstimator' diff --git a/src/rail/estimation/summarizer.py b/src/rail/estimation/summarizer.py index dc025093..5c0fc60b 100644 --- a/src/rail/estimation/summarizer.py +++ b/src/rail/estimation/summarizer.py @@ -12,6 +12,8 @@ class PzInformer(RailStage): #pragma: no cover All Summarizes must have an associated Informer that produces the models given inputs such as training sets or SED template libraries with priors, hence the generic name; while "Trainer" would be accurate for data-driven estimators, "Informer" also encompases model-fitting methods ingesting prior information. They take as "input" a qp.Ensemble of per-galaxy p(z) data, which is used to "inform" the model. + + Your subclasses should be named `[AlgoName]Informer` in accordance with the standards. """ name = 'PzInformer' @@ -63,6 +65,8 @@ class CatSummarizer(RailStage): #pragma: no cover table with fluxes in photometric bands among the set of columns. It provides as "output" a QPEnsemble, with a single n(z) or samples thereof. + + Your subclasses should be named `[AlgoName]Summarizer` in accordance with the standards. """ name = 'CatSummarizer' @@ -111,6 +115,8 @@ class PZSummarizer(RailStage): PZSummarizer take as "input" a `qp.Ensemble` with per-galaxy PDFs, and provide as "output" a QPEnsemble, with per-ensemble n(z). + + Your subclasses should be named `[AlgoName]Summarizer` in accordance with the standards. """ name = 'PZtoNZSummarizer' @@ -158,6 +164,8 @@ class SZPZSummarizer(RailStage): """The base class for classes that use two sets of data: a photometry sample with spec-z values, and a photometry sample with unknown redshifts, e.g. simpleSOM and outputs a QP Ensemble with bootstrap realization of the N(z) distribution + + Your subclasses should be named `[AlgoName]Summarizer` in accordance with the standards. 
""" name = 'SZPZtoNZSummarizer' config_options = RailStage.config_options.copy() diff --git a/tests/estimation/test_algos.py b/tests/estimation/test_algos.py index 87596d3e..dc8767b5 100644 --- a/tests/estimation/test_algos.py +++ b/tests/estimation/test_algos.py @@ -124,8 +124,8 @@ def test_train_pz(): zb_expected = np.repeat(0.1445183, 10) pdf_expected = np.zeros(shape=(301,)) pdf_expected[10:16] = [7, 23, 8, 23, 26, 13] - train_algo = trainZ.InformTrainZ - pz_algo = trainZ.EstimateTrainZ + train_algo = trainZ.TrainZInformer + pz_algo = trainZ.TrainZEstimator results, rerun_results, rerun3_results = one_algo( "TrainZ", train_algo, pz_algo, train_config_dict, estim_config_dict ) From 0da14810d5c73bb625c1756c2272472c8dda3ebb Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 15:30:46 -0400 Subject: [PATCH 07/12] import numpy here --- src/rail/core/algo_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/rail/core/algo_utils.py b/src/rail/core/algo_utils.py index ae505449..5b4e0710 100644 --- a/src/rail/core/algo_utils.py +++ b/src/rail/core/algo_utils.py @@ -1,4 +1,5 @@ """Utility functions for estimation alogrithms""" +import numpy as np import os from rail.core.stage import RailStage from rail.core.utils import RAILDIR From 5c0d7e5d7316dea57457f0c3375892aaa4f82756 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 16:05:44 -0400 Subject: [PATCH 08/12] random comments and sklearnNN naming update --- src/rail/core/algo_utils.py | 2 ++ .../estimation/algos/{sklearn_nn.py => sklearnNN.py} | 8 ++++---- src/rail/estimation/estimator.py | 8 ++++++-- tests/estimation/test_algos.py | 10 +++++----- 4 files changed, 17 insertions(+), 11 deletions(-) rename src/rail/estimation/algos/{sklearn_nn.py => sklearnNN.py} (97%) diff --git a/src/rail/core/algo_utils.py b/src/rail/core/algo_utils.py index 5b4e0710..6e49c7b5 100644 --- a/src/rail/core/algo_utils.py +++ b/src/rail/core/algo_utils.py @@ -21,6 +21,8 @@ def one_algo(key, single_trainer, 
single_estimator, train_kwargs, estim_kwargs): 'tempmodelfile.tmp', run photo-z algorithm. Then, load temp modelfile and re-run, return both datasets. + + TODO: what are the parameter types? """ DS.clear() training_data = DS.read_file('training_data', TableHandle, traindata) diff --git a/src/rail/estimation/algos/sklearn_nn.py b/src/rail/estimation/algos/sklearnNN.py similarity index 97% rename from src/rail/estimation/algos/sklearn_nn.py rename to src/rail/estimation/algos/sklearnNN.py index 74febd48..2c6686e7 100644 --- a/src/rail/estimation/algos/sklearn_nn.py +++ b/src/rail/estimation/algos/sklearnNN.py @@ -58,7 +58,7 @@ def regularize_data(data): return regularized_data -class Inform_SimpleNN(CatInformer): +class SimpleNNInformer(CatInformer): """ Subclass to train a simple point estimate Neural Net photoz rather than actually predict PDF, for now just predict point zb @@ -66,7 +66,7 @@ class Inform_SimpleNN(CatInformer): photo-z later. """ - name = 'Inform_SimpleNN' + name = 'SimpleNNInformer' config_options = CatInformer.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, @@ -113,14 +113,14 @@ def run(self): self.add_data('model', self.model) -class SimpleNN(CatEstimator): +class SimpleNNEstimator(CatEstimator): """ Subclass to implement a simple point estimate Neural Net photoz rather than actually predict PDF, for now just predict point zb and then put an error of width*(1+zb). We'll do a "real" NN photo-z later. 
""" - name = 'SimpleNN' + name = 'SimpleNNEstimator' config_options = CatEstimator.config_options.copy() config_options.update(width=Param(float, 0.05, msg="The ad hoc base width of the PDFs"), ref_band=SHARED_PARAMS, diff --git a/src/rail/estimation/estimator.py b/src/rail/estimation/estimator.py index 3b0c42f2..a5e3d17d 100644 --- a/src/rail/estimation/estimator.py +++ b/src/rail/estimation/estimator.py @@ -67,8 +67,7 @@ class CatEstimator(RailStage): They take as "input" tabular data, apply the photo-z estimation and provide as "output" a QPEnsemble, with per-object p(z). - Your subclasses should be named `[AlgoName]Estimator` in accordance with the standards. - + Your subclasses should be named `[AlgoName]Estimator` in accordance with the standards. """ name = 'CatEstimator' @@ -130,6 +129,11 @@ def estimate(self, input_data): Finally, this will return a QPHandle providing access to that output data. + TODO: The handling of zmode should happen here so we don't have to change it in every estimator when it is made optional. + + TODO: How are we storing the outputs of algorithms that yield only a point estimate? qp needs a samples parameterization for this! 
+ + Parameters ---------- input_data : `dict` or `ModelHandle` diff --git a/tests/estimation/test_algos.py b/tests/estimation/test_algos.py index dc8767b5..14d5c561 100644 --- a/tests/estimation/test_algos.py +++ b/tests/estimation/test_algos.py @@ -4,7 +4,7 @@ from rail.core.algo_utils import one_algo from rail.core.stage import RailStage -from rail.estimation.algos import knnpz, pzflow, randomPZ, sklearn_nn, trainZ +from rail.estimation.algos import knnpz, pzflow, randomPZ, sklearnNN, trainZ sci_ver_str = scipy.__version__.split(".") @@ -47,8 +47,8 @@ def test_simple_nn(): } estim_config_dict = {"hdf5_groupname": "photometry", "model": "model.tmp"} # zb_expected = np.array([0.152, 0.135, 0.109, 0.158, 0.113, 0.176, 0.13 , 0.15 , 0.119, 0.133]) - train_algo = sklearn_nn.Inform_SimpleNN - pz_algo = sklearn_nn.SimpleNN + train_algo = sklearn_nn.SimpleNNInformer + pz_algo = sklearn_nn.SimpleNNEstimator results, rerun_results, rerun3_results = one_algo( "SimpleNN", train_algo, pz_algo, train_config_dict, estim_config_dict ) @@ -183,6 +183,6 @@ def test_KNearNeigh(): def test_catch_bad_bands(): params = dict(bands="u,g,r,i,z,y") with pytest.raises(ValueError): - sklearn_nn.Inform_SimpleNN.make_stage(hdf5_groupname="", **params) + sklearn_nn.SimpleNNInformer.make_stage(hdf5_groupname="", **params) with pytest.raises(ValueError): - sklearn_nn.SimpleNN.make_stage(hdf5_groupname="", **params) + sklearn_nn.SimpleNNEstimator.make_stage(hdf5_groupname="", **params) From 56b224e41ae4eb89f0f1e839ee747a7a78eee975 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 16:07:00 -0400 Subject: [PATCH 09/12] forgotten changes to sklearnNN and todos --- src/rail/estimation/algos/README.md | 2 +- src/rail/estimation/algos/sklearnNN.py | 3 +++ src/rail/estimation/informer.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/rail/estimation/algos/README.md b/src/rail/estimation/algos/README.md index 4eb7fbde..7944ea49 100644 --- 
a/src/rail/estimation/algos/README.md +++ b/src/rail/estimation/algos/README.md @@ -14,7 +14,7 @@ The usual procedure for a gridded parameterization is to define the redshift eva - `load_model`: boolean, if True codes should skip inform and load a pretrained model from the filename specified in `modelfile`. - `save_train`: boolean, if True codes should save the model as computed during the running of `inform` to the filename in `modelfile`. -TODO: move these detailed explanations to docstring of each algo so they show up in the API documentation! +**TODO: move these detailed explanations to docstring of each algo so they show up in the API documentation!** # BPZ_Lite diff --git a/src/rail/estimation/algos/sklearnNN.py b/src/rail/estimation/algos/sklearnNN.py index 2c6686e7..0ae88302 100644 --- a/src/rail/estimation/algos/sklearnNN.py +++ b/src/rail/estimation/algos/sklearnNN.py @@ -135,6 +135,9 @@ def __init__(self, args, comm=None): raise ValueError("ref_band is not in list of bands!") def _process_chunk(self, start, end, data, first): + """ + TODO: zmode here is not actually the mode! It's the MLE. 
This is another reason not to by default include the point estimate, because it matters which point estimate it is and we're imposing an assumption that "point estimate" == mode + """ color_data = make_color_data(data, self.config.bands, self.config.ref_band, self.config.nondetect_val) input_data = regularize_data(color_data) diff --git a/src/rail/estimation/informer.py b/src/rail/estimation/informer.py index 4c018944..a0dac30d 100644 --- a/src/rail/estimation/informer.py +++ b/src/rail/estimation/informer.py @@ -1 +1 @@ -# TODO: put informers here since they only care about input types and can be used for estimators, summarizers, and tomographers \ No newline at end of file +# TODO: put informers here since they only care about input types and can be used for methods that are associated with multiple of estimators, summarizers, and classifiers (tomographers) \ No newline at end of file From b5d2a961dc16ccf146a6a01d4ff397294d6aa5e8 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 16:13:58 -0400 Subject: [PATCH 10/12] updating module name --- tests/estimation/test_algos.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/estimation/test_algos.py b/tests/estimation/test_algos.py index 14d5c561..8a68436d 100644 --- a/tests/estimation/test_algos.py +++ b/tests/estimation/test_algos.py @@ -47,8 +47,8 @@ def test_simple_nn(): } estim_config_dict = {"hdf5_groupname": "photometry", "model": "model.tmp"} # zb_expected = np.array([0.152, 0.135, 0.109, 0.158, 0.113, 0.176, 0.13 , 0.15 , 0.119, 0.133]) - train_algo = sklearn_nn.SimpleNNInformer - pz_algo = sklearn_nn.SimpleNNEstimator + train_algo = sklearnNN.SimpleNNInformer + pz_algo = sklearnNN.SimpleNNEstimator results, rerun_results, rerun3_results = one_algo( "SimpleNN", train_algo, pz_algo, train_config_dict, estim_config_dict ) @@ -183,6 +183,6 @@ def test_KNearNeigh(): def test_catch_bad_bands(): params = dict(bands="u,g,r,i,z,y") with pytest.raises(ValueError): - 
sklearn_nn.SimpleNNInformer.make_stage(hdf5_groupname="", **params) + sklearnNN.SimpleNNInformer.make_stage(hdf5_groupname="", **params) with pytest.raises(ValueError): - sklearn_nn.SimpleNNEstimator.make_stage(hdf5_groupname="", **params) + sklearnNN.SimpleNNEstimator.make_stage(hdf5_groupname="", **params) From c32d454622865854ae791271011dadc1e6402d08 Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 17:31:34 -0400 Subject: [PATCH 11/12] renaming module to match stages in it --- src/rail/estimation/algos/{sklearnNN.py => simpleNN.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/rail/estimation/algos/{sklearnNN.py => simpleNN.py} (100%) diff --git a/src/rail/estimation/algos/sklearnNN.py b/src/rail/estimation/algos/simpleNN.py similarity index 100% rename from src/rail/estimation/algos/sklearnNN.py rename to src/rail/estimation/algos/simpleNN.py From 7bdb83fa782298572d96fdfc457253f17fd65f4b Mon Sep 17 00:00:00 2001 From: Alex Malz Date: Fri, 9 Jun 2023 17:48:29 -0400 Subject: [PATCH 12/12] naive stack naming propagation --- docs/source/contributing.rst | 7 ++--- examples/estimation_examples/README.md | 4 +-- .../test_sampled_summarizers.ipynb | 22 ++++++++++++++-- .../goldenspike_examples/goldenspike.ipynb | 26 ++++++++++++++++--- examples/goldenspike_examples/goldenspike.yml | 3 ++- src/rail/estimation/algos/naiveStack.py | 9 ++++--- .../goldenspike_data/goldenspike.yml | 2 +- tests/estimation/test_algos.py | 3 ++- tests/estimation/test_summarizers.py | 4 +-- 9 files changed, 62 insertions(+), 18 deletions(-) diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index fa2add9f..a780f0ce 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -138,11 +138,12 @@ Here is an example of a slightly more complicated `RailStage`. .. 
code-block:: python - class NaiveStack(PZSummarizer): - """Summarizer which simply histograms a point estimate + class NaiveStackSummarizer(PZSummarizer): + """ + Summarizer taking an average of a qp.Ensemble of PDFs. """ - name = 'NaiveStack' + name = 'NaiveStackSummarizer' config_options = PZSummarizer.config_options.copy() config_options.update(zmin=Param(float, 0.0, msg="The minimum redshift of the z grid"), zmax=Param(float, 3.0, msg="The maximum redshift of the z grid"), diff --git a/examples/estimation_examples/README.md b/examples/estimation_examples/README.md index 58a45282..ce7940c7 100644 --- a/examples/estimation_examples/README.md +++ b/examples/estimation_examples/README.md @@ -4,6 +4,6 @@ This directory contains example notebooks explaining how to use the RAIL Estimat - **RAIL_estimation_demo.ipynb** explains to inform a model used for photo-z estimation, and then how to use that model to estimate p(z). -- **SimpleSOM_demo.ipynb**, **somocluSOM_demo.ipynb**, and **somocluSOMcluster_demo.ipynb** demonstrate of the use of the `SummarizeSimpleSOM` and `SummarizeSOMoclu` `SZPZSummarizer` stages. +- **SimpleSOM_demo.ipynb**, **somocluSOM_demo.ipynb**, and **somocluSOMcluster_demo.ipynb** demonstrate of the use of the `SimpleSOMSummarizer` and `SOMocluSummarizer` `SZPZSummarizer` stages. -- [test_sampled_summarizers.ipynb](https://lsstdescrail.readthedocs.io/en/latest/source/estimation-notebooks.html#testing-sampled-summarizers) outlines quick and dirty bootstrap versions of the `NaiveStack`, `PointEstimateHist`, and `VarInference` sumarizers, as well as `NZDir`. +- [test_sampled_summarizers.ipynb](https://lsstdescrail.readthedocs.io/en/latest/source/estimation-notebooks.html#testing-sampled-summarizers) outlines quick and dirty bootstrap versions of the `NaiveStackSummarizer`, `PointEstimateHist`, and `VarInference` sumarizers, as well as `NZDir`. 
diff --git a/examples/estimation_examples/test_sampled_summarizers.ipynb b/examples/estimation_examples/test_sampled_summarizers.ipynb index 29890e5e..cf4c8d2c 100644 --- a/examples/estimation_examples/test_sampled_summarizers.ipynb +++ b/examples/estimation_examples/test_sampled_summarizers.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "c697f8c4", "metadata": {}, @@ -10,6 +11,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "042969c1-911f-4c35-a1dd-f90befe099dc", "metadata": {}, @@ -52,7 +54,7 @@ "outputs": [], "source": [ "from rail.estimation.algos.varInference import VarInferenceStack\n", - "from rail.estimation.algos.naiveStack import NaiveStack\n", + "from rail.estimation.algos.naiveStack import NaiveStackSummarizer\n", "from rail.estimation.algos.pointEstimateHist import PointEstimateHist\n", "from rail.estimation.algos.NZDir import Inform_NZDir, NZDir\n", "from rail.core.data import TableHandle, QPHandle\n", @@ -81,6 +83,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ddccd67d-7321-4e8b-b4c0-e5d7e6aacc52", "metadata": {}, @@ -149,6 +152,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5c63e8af-fd52-4ada-8d20-3e0e7f47b0f7", "metadata": {}, @@ -159,6 +163,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3f351b58-29b0-4831-b5ac-6ab0969c4d3d", "metadata": {}, @@ -175,7 +180,7 @@ "metadata": {}, "outputs": [], "source": [ - "stacker = NaiveStack.make_stage(zmin=0.0, zmax=3.0, nzbins=41, nsamples=20, output=\"Naive_samples.hdf5\", single_NZ=\"NaiveStack_NZ.hdf5\")" + "stacker = NaiveStackSummarizer.make_stage(zmin=0.0, zmax=3.0, nzbins=41, nsamples=20, output=\"Naive_samples.hdf5\", single_NZ=\"NaiveStack_NZ.hdf5\")" ] }, { @@ -189,6 +194,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "241b499f-b9f3-4780-b909-4383d1160682", "metadata": {}, @@ -222,6 +228,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": 
"ca3cb989-7fb2-4e9a-b5e8-ae7b430d0720", "metadata": {}, @@ -241,6 +248,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "540e66b7-28f0-483d-a627-00a88fec063c", "metadata": {}, @@ -297,6 +305,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f669af79-03ff-4e97-bd0b-cbb2c4a2e8fb", "metadata": {}, @@ -305,6 +314,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a390c5cb-9899-4e56-a872-04d1927a0dc7", "metadata": {}, @@ -348,6 +358,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f2395767-c65c-4e5a-af0e-b243e6df4bab", "metadata": {}, @@ -367,6 +378,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "42416fa7-0cce-4fed-965f-616eb916a78e", "metadata": {}, @@ -455,6 +467,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a870aa08-7de2-49c1-bd9a-cfda3eb17631", "metadata": {}, @@ -483,6 +496,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bfa7ee13-1f2e-4549-8bca-a5e3e78b61cd", "metadata": {}, @@ -520,6 +534,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a7cf54a8-e742-47c4-8953-e4a43874d740", "metadata": {}, @@ -571,6 +586,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f9020a5f-2820-4b61-bf08-b29d9f14014d", "metadata": {}, @@ -607,6 +623,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "607dff41-495d-4da3-9c9e-56d2d60ba90f", "metadata": {}, @@ -645,6 +662,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7c0b6442-68ce-4a7f-b7be-8727a364deec", "metadata": {}, diff --git a/examples/goldenspike_examples/goldenspike.ipynb b/examples/goldenspike_examples/goldenspike.ipynb index 27b36f1c..37a73b65 100644 --- a/examples/goldenspike_examples/goldenspike.ipynb +++ b/examples/goldenspike_examples/goldenspike.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "minute-lender", "metadata": {}, @@ -34,6 +35,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": 
"banner-migration", "metadata": {}, @@ -75,7 +77,7 @@ "from rail.estimation.algos.knnpz import Inform_KNearNeighPDF, KNearNeighPDF\n", "from rail.estimation.algos.flexzboost import Inform_FZBoost, FZBoost\n", "\n", - "from rail.estimation.algos.naiveStack import NaiveStack\n", + "from rail.estimation.algos.naiveStack import NaiveStackSummarizer\n", "from rail.estimation.algos.pointEstimateHist import PointEstimateHist\n", "\n", "from rail.evaluation.evaluator import Evaluator\n", @@ -83,6 +85,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "scheduled-chamber", "metadata": {}, @@ -106,6 +109,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "brief-institution", "metadata": {}, @@ -128,6 +132,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "66494399", "metadata": {}, @@ -158,6 +163,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0cd8b319", "metadata": {}, @@ -191,6 +197,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2368b6b2", "metadata": {}, @@ -219,6 +226,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "nonprofit-interference", "metadata": {}, @@ -307,6 +315,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "above-portable", "metadata": {}, @@ -326,6 +335,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "clinical-pavilion", "metadata": {}, @@ -334,6 +344,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "square-breeding", "metadata": {}, @@ -412,6 +423,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "formal-camping", "metadata": {}, @@ -464,6 +476,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "29d6f99e", "metadata": {}, @@ -484,6 +497,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "colonial-trailer", "metadata": {}, @@ -539,6 +553,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "right-mystery", "metadata": {}, @@ -568,6 +583,7 @@ ] }, { + "attachments": {}, 
"cell_type": "markdown", "id": "danish-miller", "metadata": {}, @@ -617,6 +633,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "grave-speaking", "metadata": {}, @@ -636,7 +653,7 @@ "outputs": [], "source": [ "point_estimate_test = PointEstimateHist.make_stage(name='point_estimate_test')\n", - "naive_stack_test = NaiveStack.make_stage(name='naive_stack_test')" + "naive_stack_test = NaiveStackSummarizer.make_stage(name='naive_stack_test')" ] }, { @@ -671,6 +688,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "medical-preview", "metadata": {}, @@ -729,6 +747,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "younger-testament", "metadata": {}, @@ -757,6 +776,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "informational-performer", "metadata": {}, @@ -799,7 +819,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/examples/goldenspike_examples/goldenspike.yml b/examples/goldenspike_examples/goldenspike.yml index 45a58ddf..8ccb4693 100644 --- a/examples/goldenspike_examples/goldenspike.yml +++ b/examples/goldenspike_examples/goldenspike.yml @@ -46,6 +46,7 @@ stages: name: inform_trainZ nprocess: 1 - classname: Estimator +# TODO: it looks like the CatEstimator for BPZ is called "BPZ_lite" right now -- check if this is a problem name: estimate_bpz nprocess: 1 - classname: TrainZEstimator @@ -57,6 +58,6 @@ stages: - classname: PointEstimateHist name: point_estimate_test nprocess: 1 -- classname: NaiveStack +- classname: NaiveStackSummarizer name: naive_stack_test nprocess: 1 diff --git a/src/rail/estimation/algos/naiveStack.py b/src/rail/estimation/algos/naiveStack.py index be6463f5..0e8ad4c5 100644 --- a/src/rail/estimation/algos/naiveStack.py +++ b/src/rail/estimation/algos/naiveStack.py @@ -1,5 +1,7 @@ """ A summarizer that simple makes a histogram of a point estimate + +TODO: needs an informer """ 
import numpy as np @@ -9,11 +11,12 @@ import qp -class NaiveStack(PZSummarizer): - """Summarizer which simply histograms a point estimate +class NaiveStackSummarizer(PZSummarizer): + """ + Summarizer taking an average of a qp.Ensemble of PDFs. """ - name = 'NaiveStack' + name = 'NaiveStackSummarizer' config_options = PZSummarizer.config_options.copy() config_options.update(zmin=Param(float, 0.0, msg="The minimum redshift of the z grid"), zmax=Param(float, 3.0, msg="The maximum redshift of the z grid"), diff --git a/src/rail/examples_data/goldenspike_data/goldenspike.yml b/src/rail/examples_data/goldenspike_data/goldenspike.yml index eb25393f..71bd24d4 100644 --- a/src/rail/examples_data/goldenspike_data/goldenspike.yml +++ b/src/rail/examples_data/goldenspike_data/goldenspike.yml @@ -57,6 +57,6 @@ stages: - classname: PointEstimateHist name: point_estimate_test nprocess: 1 -- classname: NaiveStack +- classname: NaiveStackSummarizer name: naive_stack_test nprocess: 1 diff --git a/tests/estimation/test_algos.py b/tests/estimation/test_algos.py index 8a68436d..a7c40085 100644 --- a/tests/estimation/test_algos.py +++ b/tests/estimation/test_algos.py @@ -4,7 +4,8 @@ from rail.core.algo_utils import one_algo from rail.core.stage import RailStage -from rail.estimation.algos import knnpz, pzflow, randomPZ, sklearnNN, trainZ +from rail.estimation.algos import knnpz, pzflow, randomPZ, simpleNN, trainZ +from rail.estimation.algos import simpleNN as sklearnNN sci_ver_str = scipy.__version__.split(".") diff --git a/tests/estimation/test_summarizers.py b/tests/estimation/test_summarizers.py index f66f5f89..1272c6f9 100644 --- a/tests/estimation/test_summarizers.py +++ b/tests/estimation/test_summarizers.py @@ -29,8 +29,8 @@ def one_algo(key, summarizer_class, summary_kwargs): def test_naive_stack(): summary_config_dict = {} - summarizer_class = naiveStack.NaiveStack - results = one_algo("NaiveStack", summarizer_class, summary_config_dict) + summarizer_class = 
naiveStack.NaiveStackSummarizer + results = one_algo("NaiveStackSummarizer", summarizer_class, summary_config_dict) def test_point_estimate_hist():