mggg · gabeschoenbach · Aug 4, 2021 · Jul 15, 2021 · Jul 15, 2021 · Jul 15, 2021
diff --git a/README.md b/README.md
@@ -1,15 +1,31 @@
 # PyEI
 
-PyEI is a Python library for ecological inference. It is new and under active development, so expect rough edges and bugs -- and for additional features and documentation to be coming quickly!
+PyEI is a Python library for ecological inference.
+
+An important question in some voting rights and redistricting litigation in the U.S. is whether and to what degree voting is racially polarized.
+In the setting of voting rights cases, the family of methods called "ecological inference" uses
+observed data, pairing voting outcomes with demographic information
+for each precinct in a given polity, to infer voting patterns for each demographic group. 
+
+PyEI brings together a variety of ecological inference methods in one place and facilitates reporting and plotting results; quantifying the uncertainty associated with results under a given model; making comparisons between methods; and bringing relevant diagnostic tools to bear on ecological inference methods.
+
+PyEI is relatively new and under active development, so expect rough edges and bugs -- and expect additional features and documentation to arrive quickly!
 
 ## Want to use PyEI? Start here.
 
 ### Installation
-You can install with pip:
+You can install the latest release from PyPI with:
 
 ```
-pip install git+git://github.com/mggg/ecological-inference.git
+pip install pyei
 ```
+
+Or, install directly from GitHub for the most up-to-date (but potentially less stable) version:
+
+```
+pip install git+https://github.com/mggg/ecological-inference.git
+```
+
 ### Example notebooks
 
 Check out the [intro notebooks](https://github.com/mggg/ecological-inference/tree/main/pyei/intro_notebooks) and [example notebooks](https://github.com/mggg/ecological-inference/tree/main/pyei/examples) for sample code
@@ -53,13 +69,8 @@ python -m pip install -r requirements-dev.txt  # install dev requirements
 ### Install with conda
 
 ```bash
-conda create --name pyei python=3.8  # create conda env with python 3.8
-conda activate pyei                 # activate conda env
-# See requirements.txt and requirements-dev.txt
-conda install pip
-pip install --upgrade -r requirements.txt
-pip install --upgrade -r requirements-dev.txt
-pip install -e . 
+conda create --name pyei --channel conda-forge python=3.8 --file requirements.txt --file requirements-dev.txt # create conda environment and install requirements
+pip install -e . # install project locally
 ```
 
 ### Testing

diff --git a/paper/paper.bib b/paper/paper.bib
@@ -8,7 +8,7 @@ @article{goodman1953ecological
 }
 
 @article{greiner2009r,
-  title={R$\times$ C ecological inference: bounds, correlations, flexibility and transparency of assumptions},
+  title={{R}$\times$ {C} ecological inference: bounds, correlations, flexibility and transparency of assumptions},
   author={Greiner, D James and Quinn, Kevin M},
   journal={Journal of the Royal Statistical Society: Series A (Statistics in Society)},
   volume={172},
@@ -48,7 +48,7 @@ @article{wakefield2004ecological
 }
 
 @article{rosen2001bayesian,
-  title={Bayesian and frequentist inference for ecological inference: The R$\times$ C case},
+  title={Bayesian and frequentist inference for ecological inference: The {R}$\times$ {C} case},
   author={Rosen, Ori and Jiang, Wenxin and King, Gary and Tanner, Martin A},
   journal={Statistica Neerlandica},
   volume={55},
@@ -67,12 +67,12 @@ @article{arviz_2019
     number = {33},
     pages = {1143},
     author = {Ravin Kumar and Colin Carroll and Ari Hartikainen and Osvaldo Martin},
-    title = {ArviZ a unified library for exploratory analysis of Bayesian models in Python},
+    title = {{ArviZ} a unified library for exploratory analysis of {Bayesian} models in {Python}},
     journal = {Journal of Open Source Software}
 }
 
 @article{salvatier2016probabilistic,
-  title={Probabilistic programming in Python using PyMC3},
+  title={Probabilistic programming in {Python} using {PyMC3}},
   author={Salvatier, John and Wiecki, Thomas V and Fonnesbeck, Christopher},
   journal={PeerJ Computer Science},
   volume={2},
@@ -82,7 +82,7 @@ @article{salvatier2016probabilistic
 }
 
 @Manual{eiCompare,
-    title = {eiCompare: Compares Ecological Inference, Goodman, Rows by Columns
+    title = {{eiCompare}: Compares Ecological Inference, {Goodman}, Rows by Columns
 Estimates},
     author = {Loren Collingwood and Ari Decter-Frain and Hikari Murayama and Pratik Sachdeva and Juandalyn Burke},
     year = {2020},
@@ -91,15 +91,15 @@ @Manual{eiCompare
   }
 
   @Manual{ei,
-    title = {ei: Ecological Inference},
+    title = {{ei}: Ecological Inference},
     author = {Gary King and Molly Roberts},
     year = {2016},
     note = {R package version 1.3-3},
     url = {https://CRAN.R-project.org/package=ei},
   }
 
     @Manual{eiPack,
-    title = {eiPack: Ecological Inference and Higher-Dimension Data Management},
+    title = {{eiPack}: Ecological Inference and Higher-Dimension Data Management},
     author = {Olivia Lau and Ryan T. Moore and Michael Kellermann},
     year = {2020},
     note = {R package version 0.2-1},
@@ -115,7 +115,7 @@ @book{elliot2000spatial
 }
 
     @Manual{RxCEcolInf,
-    title = {RxCEcolInf: 'R x C Ecological Inference With Optional Incorporation of
+    title = {{RxCEcolInf}: '{R} x {C} Ecological Inference With Optional Incorporation of
 Survey Information'},
     author = {D. James Greiner and Paul Baines and Kevin M. Quinn},
     year = {2019},
@@ -124,7 +124,7 @@ @Manual{RxCEcolInf
   }
 
     @article{hoffman2014no,
-  title={The No-U-Turn sampler: adaptively setting path lengths in Hamiltonian Monte Carlo.},
+  title={The {No-U-Turn} sampler: adaptively setting path lengths in {Hamiltonian} {Monte} {Carlo}.},
   author={Hoffman, Matthew D and Gelman, Andrew},
   journal={J. Mach. Learn. Res.},
   volume={15},
@@ -142,15 +142,23 @@ @book{BDA3
 
 @article{neal2011mcmc,
   doi={10.1201/b10905-7},
-  title={MCMC using Hamiltonian dynamics},
+  title={{MCMC} using {Hamiltonian} dynamics},
   author={Neal, Radford},
-  journal={Handbook of markov chain monte carlo},
+  journal={Handbook of {Markov} chain {Monte} {Carlo}},
   volume={2},
   number={11},
   pages={2},
   year={2011}
 }
 
+@misc{betancourt2018conceptual,
+      title={A Conceptual Introduction to {Hamiltonian} {Monte} {Carlo}},
+      author={Michael Betancourt},
+      year={2018},
+      eprint={1701.02434},
+      archivePrefix={arXiv},
+      primaryClass={stat.ME}
+}
 
 
 
diff --git a/paper/paper.md b/paper/paper.md
@@ -24,17 +24,26 @@ date: 10 May 2021
 bibliography: paper.bib
 ---
 
+
 # Summary
 
 An important question in some voting rights and redistricting litigation in the U.S. is whether and to what degree voting is racially polarized.
-In the setting of voting rights cases, there is a family of methods called "ecological inference" (see especially [@king1997solution]) that uses
+In the setting of voting rights cases, there is a family of methods called "ecological inference" [see especially @king1997solution] that uses
 observed data, pairing voting outcomes with demographic information
 for each precinct in a given polity, to infer voting patterns for each demographic group.
 
 More generally, we can think of ecological inference as seeking to use knowledge about the margins of a set of tables (\autoref{fig:table_ex}) to infer associations between the row and column variables, by making (typically probabilistic) assumptions. In the context of assessing racially polarized voting, a table like the one in \autoref{fig:table_ex} will correspond to a precinct, where each column corresponds to a candidate or voting outcome and each row to a racial group. Ecological inference methods then use the vote counts and demographic data for each precinct to make inferences about the overall voting preferences by demographic group, thus addressing questions like: "What percentage of East Asian voters voted for Hardy?". This example is an instance of what is referred to in the literature as "R by C" ecological inference, where here we have R $=$ 2 groups and C $=$ 3 voting outcomes.
 `PyEI` was created to support performing ecological inference with voting data; however, ecological inference methods are also applicable in other fields, such as epidemiology [@elliot2000spatial] and sociology [@goodman1953ecological].
 
-![In ecological inference we have information about the marginal counts for a set of tables like the one above and would like to make inferences about, for example, the number or proportion of East Asian voters who voted for Hardy. The system is underdetermined and ecological inference methods proceed by making statistical assumptions. \label{fig:table_ex}](figs/table_ex2.png){ width=70% }
+\newpage
+|                 | Hardy         | Kolstad        | Nadeem |   |
+| :-------------  | :-----------: | :------------: | :-------------: | :------------- |
+| East Asian      | ?             |  ?                    | ? | Total East Asian |
+| non- East Asian | ?     | ?              | ? | Total non- East Asian |
+|                 | Total for    | Total for   | Total for |  |
+|                 |  Hardy     | Kolstad   |  Nadeem |  |
+
+Table: In ecological inference we have information about the marginal counts for a set of tables like the one here and would like to make inferences about, for example, the number or proportion of East Asian voters who voted for Hardy. The system is underdetermined and ecological inference methods proceed by making statistical assumptions. \label{fig:table_ex}
 
 # Statement of need
 
@@ -58,39 +67,35 @@ ecological inference methods in different settings and/or develop new statistica
 - Dirichlet-Multinomial hierarchical models [@rosen2001bayesian]
 - A Bayesian hierarchical method for ${2 \times 2}$ EI following the approach of @wakefield2004ecological
 
-(In several of these cases, `PyEI` includes modifications to the models as originally proposed in the cited literature, such as reparametrizations or other changes to upper levels of the hierarchical models in order to ease sampling difficulties.)
+In several of these cases, `PyEI` includes modifications to the models as originally proposed in the cited literature, such as reparametrizations or other changes to upper levels of the hierarchical models in order to ease sampling difficulties.
 
 `PyEI` is intended to be easily extensible, so that additional methods from the literature can continue to be incorporated (for example, work is underway to add the method of @greiner2009r, currently implemented in the R package `RxCEcolInf` [@RxCEcolInf]). Newly developed statistical methods for ecological inference can be included and conveniently compared with existing methods.
 
 Several R libraries implementing different ecological inference methods exist, such as `eiPack` [@eiPack], `RxCEcolInf` [@RxCEcolInf], `ei` [@ei], and `eiCompare` [@eiCompare]. In addition to presenting a Python-based option that researchers who primarily use Python may appreciate, `PyEI` 
 incorporates the following key features and characteristics.
 
-First, the Bayesian hierarchical methods implemented in `PyEI` rest on modern probabilistic programming tooling [@salvatier2016probabilistic] and gradient-based MCMC methods such as the No U-Turn Sampler (NUTS) [@hoffman2014no]. Using NUTS where possible should allow for faster convergence than existing implementations that rest primarily on Metropolis-Hastings and Gibbs sampling steps. Consider effective sample size, which is a measure of how the variance of the mean of drawn samples compare to the variance of i.i.d. samples from the posterior distribution (or, very roughly, how “effective” the samples are for computing the posterior mean, compared to i.i.d. samples) [@BDA3]. In Metropolis-Hastings, the number of evaluations of the log-posterior required for a given effective sample size scales linearly with the dimensionality of the parameter space, while in Hamiltonian Monte Carlo approaches such as NUTS, the number of required evaluations of the gradient of the log-posterior scales only as the fourth root of the dimension [@neal2011mcmc]. Reasonable scaling with the dimensionality of the parameter space is important in ecological inference, as that dimensionality is large when there are many precincts.
+First, the Bayesian hierarchical methods implemented in `PyEI` rest on modern probabilistic programming tooling [@salvatier2016probabilistic] and gradient-based MCMC methods such as the No U-Turn Sampler (NUTS) [@hoffman2014no; @betancourt2018conceptual]. Using NUTS where possible should allow for faster convergence than existing implementations that rest primarily on Metropolis-Hastings and Gibbs sampling steps. Consider effective sample size, which is a measure of how the variance of a Monte Carlo estimate of a posterior expectation computed from dependent samples compares to the variance of the corresponding estimate computed from independent samples from the posterior distribution (or, very roughly, how “effective” the samples are for estimating a posterior expectation, compared to independent samples) [@BDA3]. Under certain assumptions on the target posterior distribution, in Metropolis-Hastings the number of evaluations of the log-posterior required for a given effective sample size scales linearly with the dimensionality of the parameter space, while in Hamiltonian Monte Carlo approaches such as NUTS, the number of required evaluations of the gradient of the log-posterior scales only as the fourth root of the dimension [@neal2011mcmc]. Reasonable scaling with the dimensionality of the parameter space is important in ecological inference, as that dimensionality is large when there are many precincts.
 
 Second, integration with the existing tools `PyMC3` [@salvatier2016probabilistic] and `ArviZ` [@arviz_2019] makes the results amenable to state-of-the-art diagnostics (e.g. convergence diagnostics), and some reasonable checks are performed automatically.
 
-Third, summary and plotting utilities for reporting, visualizing, and comparing results are included (see example plots below), with an emphasis on visualizations and reports that clarify the uncertainty of estimates under a model.
+Third, summary and plotting utilities for reporting, visualizing, and comparing results are included (e.g. \autoref{fig:kdes}, \autoref{fig:polarization}), with an emphasis on visualizations and reports that clarify the uncertainty of estimates under a model.
 
 Lastly, clear documentation is provided, including a set of introductory and example notebooks.
 
-# Acknowledgments
-
-This software development is part of a research project comparing methods, joint with Moon Duchin and Thomas Weighill. We thank Colin Carroll, JN Matthews, and Matthew Sun for their helpful contributions to `PyEI`. 
+![Kernel density estimation plots for visualizing uncertainty of support for candidates within each group.\label{fig:kdes}](figs/figure2.png){ width=100% } 
 
-\newpage
-# Examples of plotting functionality
+![Visualizing and quantifying degree of polarization.\label{fig:polarization}](figs/figure4.png){ width=100% }
 
-![KDE plots for visualizing uncertainty of support for candidates within each group.\label{fig:kdes}](figs/figure2.png){ width=100% } 
+# Acknowledgments
 
-![Bayesian credible intervals for support of candidates within groups.\label{fig:credible_interval}](figs/figure3.png){ width=100% }
+This software development is part of a research project comparing methods, joint with Moon Duchin and Thomas Weighill. We thank Colin Carroll, JN Matthews, and Matthew Sun for their helpful contributions to `PyEI`. 
 
-![Visualizing and quantifying degree of polarization.\label{fig:polarization}](figs/figure4.png){ width=100% }
 
-![Visualizing estimates and uncertainty for precinct-level estimates.\label{fig:precinct_level}](figs/figure5.png){ width=50% }
+<!-- ![Bayesian credible intervals for support of candidates within groups.\label{fig:credible_interval}](figs/figure3.png){ width=100% } -->
 
-![Tomography plots for two-by-two ecological inference.\label{fig:tomography}](figs/figure6.png){ width=40% }
+<!-- ![Visualizing estimates and uncertainty for precinct-level estimates.\label{fig:precinct_level}](figs/figure5.png){ width=50% }
 
-\newpage
+![Tomography plots for two-by-two ecological inference.\label{fig:tomography}](figs/figure6.png){ width=40% } -->
 
 # References
 
diff --git a/paper/paper.pdf b/paper/paper.pdf
diff --git a/pyei/goodmans_er.py b/pyei/goodmans_er.py
@@ -19,6 +19,13 @@ class GoodmansER:
     """
 
     def __init__(self, is_weighted_regression=False):
+        """
+        Parameters
+        ----------
+        is_weighted_regression: bool, optional
+            Default is False. If true, weight precincts by population when
+            performing the regression.
+        """
         self.demographic_group_fraction = None
         self.vote_fraction = None
         self.demographic_group_fraction = None
@@ -113,7 +120,20 @@ class GoodmansERBayes(TwoByTwoEIBaseBayes):
     Generate samples from the posterior.
     """
 
-    def __init__(self, model_name, weighted_by_pop=False, **additional_model_params):
+    def __init__(
+        self, model_name="goodman_er_bayes", weighted_by_pop=False, **additional_model_params
+    ):
+        """
+        Optional arguments:
+        model_name: str
+            Default is "goodman_er_bayes"
+        weighted_by_pop: bool
+            Default is False. If true, weight precincts by population when
+            performing the regression.
+        additional_model_params:
+            Any hyperparameters for model
+        """
+        # TODO if no other model name is applicable here, remove need for model_name argument
         super().__init__(model_name, **additional_model_params)
         self.weighted_by_pop = weighted_by_pop
 
@@ -170,7 +190,7 @@ def fit(
             group_fraction, votes_fraction, **self.additional_model_params
         )
 
-        with self.sim_model:
+        with self.sim_model:  # pylint: disable=not-context-manager
             self.sim_trace = pm.sample(
                 1000, tune=tune, target_accept=target_accept, **other_sampling_args
             )
@@ -189,8 +209,17 @@ def calculate_sampled_voting_prefs(self):
         )  # sampled voted prefs across precincts
 
     def compute_credible_int_for_line(self, x_vals=np.linspace(0, 1, 100)):
-        """Computes regression line (mean) and 95% credible interval for the
-        mean line at each of the specified x values(x_vals)
+        """Computes regression line (mean) and 95% central credible interval for
+        the mean line at each of the specified x values (x_vals)
+
+        Parameters
+        ----------
+        x_vals : numpy array, optional
+            Default: np.linspace(0, 1, 100). 1-dimensional numpy array of values between
+            0 and 1, for which the function should compute the mean line and 95%
+            credible interval. Each element of x_vals represents the fraction of the population
+            that is in the demographic group of interest (values for X in the notation of King '97)
+
         """
         lower_bounds = np.empty_like(x_vals)
         upper_bounds = np.empty_like(x_vals)