diff --git a/paper/paper.bib b/paper/paper.bib index 9fed8a49..dad3cd3f 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -165,3 +165,48 @@ @article{core2023ipcc year={2023}, doi={doi: 10.59327/IPCC/AR6-9789291691647.001} } + +@article{harris2020array, + title={Array programming with {NumPy}}, + author={Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. van der Walt and Ralf Gommers and Pauli Virtanen and David Cournapeau and Eric Wieser and Julian Taylor and Sebastian Berg and Nathaniel J. Smith and Robert Kern and Matti Picus and Stephan Hoyer and Marten H. van Kerkwijk and Matthew Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and Warren Weckesser and Hameer Abbasi and Christoph Gohlke and Travis E. Oliphant}, + year={2020}, + month=sep, + journal={Nature}, + volume={585}, + number={7825}, + pages={357--362}, + doi={10.1038/s41586-020-2649-2}, + publisher={Springer Science and Business Media {LLC}}, + url={https://doi.org/10.1038/s41586-020-2649-2} +} + +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} + +@software{reback2020pandas, + author= {{The pandas development team}}, + title={pandas-dev/pandas: Pandas}, + month=feb, + year=2020, + publisher={Zenodo}, + version={latest}, + doi= {10.5281/zenodo.3509134}, + url= {https://doi.org/10.5281/zenodo.3509134} +} + +@article{Hoyer_xarray_N-D_labeled_2017, + author={Hoyer, Stephan and Hamman, Joseph}, + doi={10.5334/jors.148}, + journal={Journal of Open Research Software}, + month=apr, + number={1}, + title={{xarray: N-D labeled Arrays and Datasets in Python}}, + volume={5}, + year={2017} +} diff --git a/paper/paper.md b/paper/paper.md index 6215d43e..dc480dc0 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,34 +30,34 @@ bibliography: paper.bib # Summary -Understanding the interaction between humans and the Earth system is a -computationally daunting task, with many possible approaches depending on -resources available and questions of interest. For example, state of the art -impact models require decade-long time series of relatively high frequency, -spatially resolved and often multiple variables representing climatic impact- -drivers [@ruane2022climatic]. Most commonly these are derived from the outputs -of detailed, computationally expensive Earth System Models (ESMs) run according -to a standard, limited set of future scenarios, the latest being the SSP-RCPs -run under CMIP6-ScenarioMIP [@Eyringetal2016;@ONeilletal2016]. At the time of -writing, O'Neill et al has been cited more than 1750 times and Eyring et al -more than 5000 times, highlighting the broad, general use of this data. - - -Often, however, impact modeling seeks to explore new scenarios that were not +Understanding the interaction between humans and the Earth system is a +computationally daunting task, with many possible approaches depending on +resources available and questions of interest.
For example, state-of-the-art +impact models require decade-long time series of relatively high frequency, +spatially resolved and often multiple variables representing climatic impact-drivers +[@ruane2022climatic]. Most commonly these are derived from the outputs +of detailed, computationally expensive Earth System Models (ESMs) run according +to a standard, limited set of future scenarios, the latest being the SSP-RCPs +run under CMIP6-ScenarioMIP [@Eyringetal2016;@ONeilletal2016]. At the time of +writing, O'Neill et al. has been cited more than 1750 times and Eyring et al. +more than 5000 times, highlighting the broad, general applications of this data. + + +Often, however, impact modeling seeks to explore new scenarios that were not part of the ScenarioMIP protocol, and/or needs a larger set of initial condition -ensemble members than are typically available to quantify the effects of ESM -internal variability. In addition, the recognition that the human and Earth +ensemble members than are typically available to quantify the effects of ESM +internal variability. In addition, the recognition that the human and Earth systems are fundamentally intertwined, and may feature potentially -significant feedback loops, is making integrated, simultaneous modeling of -the coupled human-Earth system increasingly necessary, if computationally -challenging with most existing tools [@thornton2017biospheric]. +significant feedback loops, is making integrated, simultaneous modeling of +the coupled human-Earth system increasingly necessary, if computationally +challenging with most existing tools [@thornton2017biospheric]. -For impact modelers, climate model emulators can be the answer to meet both -the needs of 1) creating realizations for novel scenarios and 2)achieving a -simplified, computationally tractable representation of ESM behavior in a -coupled human-Earth system modeling framework. 
We proposed a new, -comprehensive approach to such emulation of gridded, multivariate ESM +For impact modelers, climate model emulators can be the answer to meet both +the needs of: 1) creating realizations for novel scenarios and 2) achieving a +simplified, computationally tractable representation of ESM behavior in a +coupled human-Earth system modeling framework. We proposed a new, +comprehensive approach to such emulation of gridded, multivariate ESM outputs for novel scenarios without the computational cost of a full ESM, STITCHES [@tebaldi2022stitches]. The approach outlined in Tebaldi et al. should be extensible to future CMIP eras, although the `stitches` software at present @@ -69,13 +69,12 @@ The corresponding `stitches` Python package uses existing archives of ESMs’ scenario experiments from CMIP6/ScenarioMIP to construct gridded, multivariate realizations of new scenarios provided by reduced complexity climate models [@hartin2015simple;@meinshausen2011emulating;@smith2018fair], or to -enrich existing initial condition ensembles. Its output has the -same characteristics of the ESM output emulated: multivariate (spanning +enrich existing initial condition ensembles. Its output provides the same +characteristics as the emulated ESM output: multivariate (spanning potentially all variables that the ESM has saved), spatially resolved (down to -the native grid of the ESM), and as high frequency as the original output has -been saved at. A new realization of multiple variables can be generated on -the order of minutes with `stitches`, rather than the hours or sometimes days -that ESMs require. +the native grid of the ESM), and preserving the same high frequency as the original data. +A new realization of multiple variables can be generated on the order of minutes with +`stitches`, rather than the hours or sometimes days that ESMs require. 
@@ -86,86 +85,85 @@ characteristics of a particular ESM’s outputs for multiple variables and at time scales (often daily or monthly) relevant to impact models. Many existing ESM emulation methods, such as MESMER [@beusch2020emulating;@nath2022mesmer;@quilcaille2022showcasing], rely -on 'bottom up' methods, -inferring from the ESM outputs available for training the details of some -statistical process (or, more recently, a machine learning algorithm) able to -generate new realizations with the same spatiotemporal behavior of the original +on 'bottom up' methods, inferring from the ESM outputs available for training the +details of some statistical process (or, more recently, a machine learning algorithm) +able to generate new realizations with the same spatiotemporal behavior as the original ESM outputs, using as input in the generative phase only large scale information, -like global average temperature (GSAT), that can be generated by a reduced -complexity model, such as Hector, Magicc, or FAIR -[@hartin2015simple;@meinshausen2011emulating;@smith2018fair] +like global average temperature (GSAT), that can be generated by a reduced complexity +model, such as Hector, MAGICC, or FAIR +[@hartin2015simple;@meinshausen2011emulating;@smith2018fair]. - -The STITCHES approach instead takes a top-down approach inspired by the warming- -level style of analyses used by past Intergovernmental Panel on Climate Change +The STITCHES approach instead takes a top-down approach inspired by the warming-level +style of analyses used by past Intergovernmental Panel on Climate Change reports [@SR15;@arias2021climate;@masson2021ipcc;@core2023ipcc]. Specifically, -`stitches` takes existing ESM output and intelligently recombines time windows -of these gridded, multivariate outputs into new instances of transient, 21st -century trajectories by stitching them together on the basis of a target GSAT -trajectory.
The latter can represent an existing scenario (i.e., one that the -ESM has run) or a new one that a simple model can produce, as long as the latter -is intermediate to existing ones in forcing levels/GSAT. We encourage users to +`stitches` takes existing ESM output and intelligently recombines time windows +of these gridded, multivariate outputs into new instances of transient, 21st +century trajectories by stitching them together on the basis of a target GSAT +trajectory. The latter can represent an existing scenario (i.e., one that the +ESM has run) or a new one that a simple model can produce, as long as the latter +is intermediate to existing ones in forcing levels/GSAT. We encourage users to see the flowchart included in the `stitches` [quickstart notebook](https://github.com/JGCRI/stitches/blob/main/notebooks/stitches-quickstart.ipynb) -and [website](https://jgcri.github.io/stitches/), as well as in Tebaldi et al, -for a visual example of this process. Tebaldi et al of course contains full -details and more illustrative figures. +and [website](https://jgcri.github.io/stitches/), as well as in Tebaldi et al., for a visual example of this process. +Tebaldi et al. of course contains the full details as well as more illustrative figures. -Research from the climate science community has indicated that many ESM output -variables are tightly dependent upon the GSAT trajectory and thus scenario -independent (see [@SR15] and citations therein, in particular James et al. -[@james2017characterizing]), justifying our approach. Thus, the statistical -characteristics of ESM output are preserved by the construction process `stitches` -implements, as outlined in Tebaldi et al. One of the major benefits of this +Research from the climate science community has indicated that many ESM output +variables are tightly dependent upon the GSAT trajectory and thus scenario +independent (see [@SR15] and citations therein, in particular James et al. 
+[@james2017characterizing]), justifying our approach. Thus, the statistical +characteristics of ESM output are preserved by the construction process `stitches` +implements, as outlined in Tebaldi et al. One of the major benefits of this top-down approach is that it jointly emulates outputs of multiple ESM variables, -maintaining by construction the joint behavior of the original ESM output, -something not presently available in other packages to our knowledge. Most -impact-relevant atmospheric variables such as temperature, precipitation, relative -humidity, and sea level pressure can be emulated by `stitches` as they are -scenario-independent and have a short memory (compared to the window used by -‘stitches’, presently set to 9-years). Any variable that the ESM has archived can -be emulated jointly. Variables that represent the cumulative effect of warming, -such as sea level rise, or that have a long memory, like glacier mass loss or -mega-drought, cannot be emulated with `stitches`. `stitches` can produce new -realizations for variables archived by the ESM, but it can produce only finitely -many new realizations, the maximum number depending on the number of runs -archived by each ESM. Currently, new realizations from `stitches` can be +maintaining by construction the joint behavior of the original ESM output, +something not presently available in other packages to our knowledge. Most +impact-relevant atmospheric variables such as temperature, precipitation, relative +humidity, and sea level pressure can be emulated by `stitches` as they are +scenario-independent and have a short memory (compared to the window used by +`stitches`, presently set to nine (9) years). Any variable that the ESM has archived can +be emulated jointly. Variables that represent the cumulative effect of warming, +such as sea level rise, or that have a long memory, like glacier mass loss or +mega-drought, cannot be emulated with `stitches`.
`stitches` can produce new +realizations for variables archived by the ESM, but it can produce only finitely +many new realizations, the maximum number depending on the number of runs +archived by each ESM. Currently, new realizations from `stitches` can be appended to archived ESM realizations to result in double to triple the number -of runs available, this is arguably one of the main differences from the above- -mentioned bottom-up approaches, which can generate infinite new realizations -once an accurate statistical process is estimated from existing data. We see +of runs available; this is arguably one of the main differences from the above-mentioned +bottom-up approaches, which can generate infinite new realizations +once an accurate statistical process is estimated from existing data. We see this as a source of complementarity between these two emulation approaches. The `stitches` Python package currently relies on close integration with the -Pangeo cloud catalog of CMIP6 ESM outputs -(https://gallery.pangeo.io/repos/pangeo-gallery/cmip6/). Thanks to this -integration, users are not required to pre-download the entire -CMIP6-ScenarioMIP archive of ESM outputs, and can quickly and flexibly -emulate variables from any of the 40 ESMs participating in ScenarioMIP. +Pangeo Cloud catalog of CMIP6 ESM outputs (https://gallery.pangeo.io/repos/pangeo-gallery/cmip6/). +Thanks to this integration, users are not required to pre-download the entire +CMIP6-ScenarioMIP archive of ESM outputs, and can quickly and flexibly +emulate variables from any of the 40 ESMs participating in ScenarioMIP. In addition to the requirements for working with Pangeo in Python, `stitches` -relies only on a few common scientific Python packages (`xarray`, `numpy`, -`pandas`, `sk-learn`), which are specified required dependencies in the package. 
-Finally, because `stitches` is intended for use by impact modelers, the new -realizations generated by `stitches` are NetCDF files with the same dimension +relies only on a few common scientific Python packages, namely `xarray`, `numpy`, +`pandas`, and `scikit-learn` +[@Hoyer_xarray_N-D_labeled_2017;@harris2020array;@reback2020pandas;@scikit-learn], +which are specified as required dependencies in the package. +Finally, because `stitches` is intended for use by impact modelers, the new +realizations generated by `stitches` are NetCDF files with the same dimension information and generally identical structure to the original CMIP6 ESM outputs. -These outputs from `stitches` can then serve as inputs to impact models with -little to no code changes in the impact models. It may also be possible to -endogenize climate impacts in scenario construction by coupling `stitches` -with impact models for multiple sectors and a reduced complexity climate model -such as Hector, Magicc, or FAIR -[@hartin2015simple;@meinshausen2011emulating;@smith2018fair]. With the -computational efficiency of using emulators, +These outputs from `stitches` can then serve as inputs to impact models with +little to no code changes in the impact models. It may also be possible to +endogenize climate impacts in scenario construction by coupling `stitches` +with impact models for multiple sectors and a reduced complexity climate model +such as Hector, MAGICC, or FAIR +[@hartin2015simple;@meinshausen2011emulating;@smith2018fair]. +With the computational efficiency of using emulators, it may be possible to interactively develop new scenarios with more insight than -would be possible using multimodel ESM ensemble statistics or using off-the-shelf -ESM scenarios alone. +would otherwise be possible using multimodel ESM ensemble statistics or using +off-the-shelf ESM scenarios alone. # Code availability + The `stitches` GitHub repository (https://github.com/JGCRI/stitches) provides -installation instructions.
+installation instructions. Also included is a [quickstart notebook](https://github.com/JGCRI/stitches/blob/main/notebooks/stitches-quickstart.ipynb) that serves as a tutorial for using the package.