From 98f18aa343bed9f91d07bc61a96e799acf7f39e0 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 16 Mar 2024 12:03:14 -0400 Subject: [PATCH 01/18] v2.2 update --- delicatessen/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delicatessen/version.py b/delicatessen/version.py index ab02866..d3c8880 100644 --- a/delicatessen/version.py +++ b/delicatessen/version.py @@ -1 +1 @@ -__version__ = "2.1" +__version__ = "2.2" From d924ff61d81139815c92d7ae435414444f2d9022 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:00:15 -0400 Subject: [PATCH 02/18] adding by-hand approx deriv --- delicatessen/derivative.py | 130 ++++++++++++++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 2 deletions(-) diff --git a/delicatessen/derivative.py b/delicatessen/derivative.py index c08c3bc..6fe5b12 100644 --- a/delicatessen/derivative.py +++ b/delicatessen/derivative.py @@ -4,6 +4,132 @@ from scipy.stats import norm +def approx_differentiation(xk, f, epsilon=1e-9, method='capprox'): + r"""Numerical approximation to compute the gradient. This function implements numerical approximation methods for + derivatives generally (i.e., it provides the first-order forward, backward, and central difference approximations). + + Note + ---- + This functionality is only intended for use behind the scenes in ``delicatessen``. Numerical approximation is + implemented from scratch to offer backward and central difference approximations (SciPy's ``approx_fprime`` only + offers the forward difference). + + + The forward difference approximation is + + .. math:: + + \frac{f(x + \epsilon) - f(x)}{\epsilon} + + the backward difference approximation is + + .. math:: + + \frac{f(x) - f(x - \epsilon)}{\epsilon} + + and the central difference approximation is + + .. math:: + + \frac{f(x + \epsilon) - f(x - \epsilon)}{2\epsilon} + + Here, the numerical approximation is implemented by generating matrices for output from a function evaluated under + minor perturbations (determined by ``epsilon``) of each input argument. These matrices are then subtracted from + each other and then scaled by ``epsilon``. + + Parameters + ---------- + xk : ndarray, list, shape (n, ) + Point(s) or coordinate vector to evaluate the gradient at. + f : callable + Function of which to estimate the gradient of. + epsilon : float, optional + Increment to perturb the points by to compute the gradient. This should be a small value + method : str, optional + Approximation to use to compute the gradient. Default is `capprox` which uses the central difference method. + One can also specify the forward difference (`fapprox`) or backward difference (`bapprox`) methods. + + Returns + ------- + numpy.array : + Corresponding array of the pairwise derivatives for all different input x values. + + Examples + -------- + >>> import numpy as np + >>> from delicatessen.derivative import approx_differentiation + + To illustrate use, we will compute the derivative of the following function + + .. math:: + + f(x) = x^2 - x^1 + sin(x + \sqrt{x}) + + >>> def f(x): + >>> return x**2 - x + np.sin(x + np.sqrt(x)) + + If you work out the deriative by-hand, you will end up with the following + + .. math:: + + 2x - 1 + \left( \frac{1}{2 \sqrt{x}} + 1 \right) \cos(x + \sqrt{x}) + + Instead, we can use the central difference approximation to evaluate the derivative at a specific point. 
Here, we + will evaluate the derivative at :math:`x=1` + + >>> dy = approx_differentiation(xk=[1, ], f=f) + + which returns ``0.37578``, which is close to plugging in :math:`x=1` into the previous equation. + + The derivative of a function with multiple inputs and multiple outputs can also be evaluated. Consider the following + example with three inputs and two outputs + + >>> def f(x): + >>> return [x[0]**2 - x[1], np.sin(np.sqrt(x[1]) + x[2]) + x[2]*(x[1]**2)] + + >>> dy = approx_differentiation(xk=[0.7, 1.2, -0.9], f=f) + + which will return a 2-by-3 array of all the x-y pair derivatives at the given values. Here, the rows correspond to + the output and the columns correspond to the inputs. + """ + # Setup parameters for call + xk = np.asarray(xk) + xp = xk.shape[0] + shift = np.identity(n=xk.shape[0]) * epsilon + + def generate_matrix(x_shift, f): + """Internal function to generate a matrix of the outputs under the parameter shifts, defined by x_shift""" + shift_matrix = [] + for j in range(xp): + shift_matrix.append(f(x_shift[j, :])) + return np.asarray(shift_matrix) + + # Computing the gradient using the corresponding method + if method == 'capprox': + lower = (xk - shift) + upper = (xk + shift) + f0 = generate_matrix(x_shift=lower, f=f) + f1 = generate_matrix(x_shift=upper, f=f) + deriv = (f1 - f0).T / (2*epsilon) + elif method == 'fapprox': + lower = (xk - shift) + f0 = generate_matrix(x_shift=lower, f=f) + f_eval = f(xk) + f1 = np.asarray([f_eval for i in range(xp)]) + deriv = (f1 - f0).T / epsilon + elif method == 'bapprox': + f_eval = f(xk) + f0 = np.asarray([f_eval for i in range(xp)]) + upper = (xk + shift) + f1 = generate_matrix(x_shift=upper, f=f) + deriv = (f1 - f0).T / epsilon + else: + raise ValueError("Method chosen is not supported") + + # Processing the final return based on parameter shape + return deriv + + def auto_differentiation(xk, f): r"""Forward-mode automatic differentiation. Automatic differentiation offers a way to compute the exact derivative, rather than numerically approximated (i.e., the central difference method). Automatic differentiation iteratively @@ -22,9 +148,9 @@ def auto_differentiation(xk, f): Parameters ---------- - xk: ndarray, list, shape (n, ) + xk : ndarray, list, shape (n, ) Point(s) or coordinate vector to evaluate the gradient at. - f: callable + f : callable Function of which to estimate the gradient of. 
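As a quick numerical check of the central difference approximation described above, the result can be compared against the derivative worked out by hand. This is a minimal sketch; it assumes ``approx_differentiation`` is importable from ``delicatessen.derivative`` as added in this patch

>>> import numpy as np
>>> from delicatessen.derivative import approx_differentiation

>>> def f(x):
>>>     return x**2 - x + np.sin(x + np.sqrt(x))

>>> # Derivative derived by hand (see above), evaluated at x=1
>>> truth = 2*1 - 1 + (1/(2*np.sqrt(1.)) + 1)*np.cos(1 + np.sqrt(1.))
>>> dy = approx_differentiation(xk=[1., ], f=f, method='capprox')
>>> np.allclose(np.squeeze(dy), truth, atol=1e-5)
True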
Returns From 6653608e0250b7e40d231cda65f2cd28a2ecdcbc Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:00:45 -0400 Subject: [PATCH 03/18] revising how the sandwich is computed --- delicatessen/__init__.py | 1 + delicatessen/mestimation.py | 104 ++++------------------- delicatessen/sandwich.py | 164 ++++++++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 87 deletions(-) create mode 100644 delicatessen/sandwich.py diff --git a/delicatessen/__init__.py b/delicatessen/__init__.py index 550d3db..9e86c51 100644 --- a/delicatessen/__init__.py +++ b/delicatessen/__init__.py @@ -12,3 +12,4 @@ from .version import __version__ from .mestimation import MEstimator +from .sandwich import compute_sandwich diff --git a/delicatessen/mestimation.py b/delicatessen/mestimation.py index a5a7f07..a5974e2 100644 --- a/delicatessen/mestimation.py +++ b/delicatessen/mestimation.py @@ -1,12 +1,8 @@ -import warnings - import numpy as np from scipy.optimize import newton, root -from scipy.misc import derivative -from scipy.optimize import approx_fprime from scipy.stats import norm -from delicatessen.derivative import auto_differentiation +from delicatessen.sandwich import compute_bread, compute_meat, build_sandwich class MEstimator: @@ -292,32 +288,25 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr for s, n in zip(self._subset_, slv_theta): # ... then look over the subset and input theta self.theta[s] = n # ... and update the subset to the output/solved theta - # STEP 2: calculating Variance + # STEP 2: calculating the sandwich variance + # After solving for the parameters, we now can compute the empirical sandwich variance estimator. This is + # done by compute the bread and meat matrices and then combining them. This is now done by a separate + # functionality within `sandwich.py`. # STEP 2.1: baking the Bread - self.bread = self._bread_matrix_(theta=self.theta, # Provide theta-hat - method=deriv_method, # Method to use - dx=dx) / self.n_obs # Derivative approximation value + self.bread = compute_bread(stacked_equations=self.stacked_equations, + theta=self.theta, + deriv_method=deriv_method, + dx=dx) / self.n_obs # STEP 2.2: slicing the meat - evald_theta = np.asarray(self.stacked_equations(theta=self.theta)) # Evaluating EE at theta-hat - self.meat = np.dot(evald_theta, evald_theta.T) / self.n_obs # Meat is dot product of arrays + self.meat = compute_meat(stacked_equations=self.stacked_equations, + theta=self.theta) / self.n_obs # STEP 2.3: assembling the sandwich (variance) - if np.isnan(self.bread).any(): - warnings.warn("The bread matrix contains at least one np.nan, so it cannot be inverted. The variance will " - "not be calculated. This may be an issue with the provided estimating equations or the " - "evaluated theta.", - UserWarning) - else: - if allow_pinv: # Support 1D theta-hat - bread_invert = np.linalg.pinv(self.bread) # ... find pseudo-inverse - else: # Support 1D theta-hat - bread_invert = np.linalg.inv(self.bread) # ... 
find inverse - sandwich = np.dot(np.dot(bread_invert, self.meat), bread_invert.T) # Compute sandwich - - # STEP 3: updating storage for results - self.asymptotic_variance = sandwich # Asymptotic variance requires division by n (done above) - self.variance = sandwich / self.n_obs # Variance estimate requires division by n^2 (second done here) + self.asymptotic_variance = build_sandwich(bread=self.bread, + meat=self.meat, + allow_pinv=allow_pinv) + self.variance = self.asymptotic_variance / self.n_obs def confidence_intervals(self, alpha=0.05): r"""Calculate Wald-type :math:`(1 - \alpha) \times` 100% confidence intervals using the point estimates and @@ -344,7 +333,7 @@ def confidence_intervals(self, alpha=0.05): intervals for :math:`\theta_b` """ # Check that estimate() has been called - if self.variance is None: + if self.variance is None or np.isnan(self.variance): raise ValueError("Either theta has not been estimated yet, or there is a np.nan in the bread matrix. " "Therefore, confidence_intervals() cannot be called.") # Check valid alpha value is being provided @@ -383,7 +372,7 @@ def z_scores(self, null=0): Array of Z-scores for :math:`\theta_1, ..., \theta_b`, respectively """ # Check that self.estimate() has been called - if self.theta is None: + if self.variance is None or np.isnan(self.variance): raise ValueError("Either theta has not been estimated yet, or there is a np.nan in the bread matrix. " "Therefore, z_scores() cannot be called.") @@ -480,25 +469,6 @@ def _mestimation_answer_(self, theta): return self._mestimator_sum_(stacked_equations=stacked_equations, # Passing to evaluating function subset=self._subset_) # ... with specified subset - def _mestimation_answer_no_subset_(self, theta): - """Internal function to evaluate the sum of the estimating equations. The summation is internally evaluated - since access to the estimating functions is needed for the sandwich variance computations. This function is - used by the bread matrix computation procedure (since subset is ignored for the bread). - - Parameters - ---------- - theta : array - b-by-n matrix to sum over the values of n. - - Returns - ------- - array : - b-by-1 array, which is the sum over n for each b. - """ - stacked_equations = np.asarray(self.stacked_equations(theta)) # Returning stacked equation - return self._mestimator_sum_(stacked_equations=stacked_equations, # Passing to evaluating function - subset=None) # ... with always /no/ subset - @staticmethod def _mestimator_sum_(stacked_equations, subset): """Function to evaluate the sum of the M-estimator over the :math:`n` units. @@ -614,43 +584,3 @@ def _solve_coefficients_(stacked_equations, init, method, maxiter, tolerance): # Return optimized theta array return psi - - def _bread_matrix_(self, theta, method, dx): - """Evaluate the bread matrix by taking all partial derivatives of the thetas in the estimating equation. - - Parameters - ---------- - theta : ndarray, float - Solved values of theta to evaluate at - dx : float - Spacing to use to numerically approximate the partial derivatives of the bread matrix. - - Returns - ------- - numpy.array - """ - val_range = len(theta) # Check how many values of theta there is - est_eq = self._mestimation_answer_no_subset_ # Estimating equations to compute derivative of - - # Compute the derivative - if method.lower() == "approx": # Numerical approximation method - if val_range == 1: # When only a single theta is present - d = derivative(est_eq, # ... approximate the derivative - theta, dx=dx) # ... 
at the solved theta (input) - bread_matrix = np.array([[d, ], ]) # ... return as 1-by-1 array object for inversion - else: # Otherwise approximate the partial derivatives - bread_matrix = approx_fprime(xk=theta, # ... use built-in jacobian functionality of SciPy - f=est_eq, # ... with not-subset estimating equations - epsilon=dx) # ... order option removed in v1.0 - - elif method.lower() == "exact": # Automatic Differentiation - bread_matrix = auto_differentiation(xk=theta, # Compute the exact derivative at theta - f=est_eq) # ... for the given estimating equations - - else: # Error for unsupported option - raise ValueError("The input for deriv_method was " - + str(method) - + ", but only 'approx' and 'exact' are available.") - - # Return bread (multiplied by negative 1 as in Stefanski & Boos) - return -1 * bread_matrix diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py new file mode 100644 index 0000000..cf99de1 --- /dev/null +++ b/delicatessen/sandwich.py @@ -0,0 +1,164 @@ +import warnings + +import numpy as np +from scipy.optimize import approx_fprime + +from delicatessen.derivative import auto_differentiation, approx_differentiation + + +def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, allow_pinv=True): + """Compute the empirical sandwich variance estimator given [...] and parameter estimates. + + Parameters + ---------- + stacked_equations : function, callable + Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + documentation for how to construct a set of estimating equations. + theta : list, set, array + Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes + that you have solved for the ``theta`` that correspond to the root of the input estimating equations. + deriv_method : str, optional + Method to compute the derivative of the estimating equations for the bread matrix. Options include numerical + approximation via the central difference method (``'approx'``) and forward-mode automatic differentiation + (``'exact'``). Default is ``'approx'``. + dx : float, optional + Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value + for ``dx`` should be used, since some large values can result in poor approximations. This argument is only + used when ``deriv_method='approx'``. Default is 1e-9. + allow_pinv : bool, optional + Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be + non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this + argument to ``False``. Default is ``True``, which is more robust to the possible bread matrices. 
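As a brief aside on why the pseudo-inverse is the more forgiving default (plain NumPy behavior, not specific to ``delicatessen``): ``numpy.linalg.inv`` raises an error when handed a singular matrix, while ``numpy.linalg.pinv`` still returns the Moore-Penrose pseudo-inverse

>>> import numpy as np
>>> singular = np.array([[1., 1.], [1., 1.]])  # rank-deficient 2-by-2 matrix
>>> np.linalg.pinv(singular)                   # pseudo-inverse is still defined
array([[0.25, 0.25],
       [0.25, 0.25]])
>>> np.linalg.inv(singular)                    # raises numpy.linalg.LinAlgError: Singular matrix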
+ + Returns + ------- + + """ + # Evaluating at provided theta values + evald_theta = np.asarray(stacked_equations(theta=theta)) # Evaluating EE at theta-hat + if len(theta) == 1: # + n_obs = evald_theta.shape[0] # Number of observations + else: # + n_obs = evald_theta.shape[1] # Number of observations + + # Step 1: Compute the bread matrix + bread = compute_bread(stacked_equations=stacked_equations, # + theta=theta, # Provide theta-hat + deriv_method=deriv_method, # Method to use + dx=dx) # + bread = bread / n_obs # + + # Step 2: Compute the meat matrix + meat = compute_meat(stacked_equations=stacked_equations, # + theta=theta) # + meat = meat / n_obs # Meat is dot product of arrays + + # Step 3: Construct sandwich from the bread and meat matrices + sandwich = build_sandwich(bread=bread, # + meat=meat, # + allow_pinv=allow_pinv) # + + # Return the constructed empirical sandwich variance estimator + return sandwich + + +def compute_bread(stacked_equations, theta, deriv_method, dx=1e-9): + """ + + Parameters + ---------- + stacked_equations + theta + deriv_method + dx + + Returns + ------- + + """ + def estimating_equation(input_theta): + if len(input_theta) == 1: + return np.sum(stacked_equations(theta=input_theta)) + else: + return np.sum(stacked_equations(theta=input_theta), axis=1) + + # Compute the derivative + if deriv_method.lower() == 'approx': + bread_matrix = approx_fprime(xk=theta, + f=estimating_equation, + epsilon=dx) + if len(theta) == 1: + bread_matrix = np.asarray([bread_matrix, ]) + elif deriv_method.lower() == 'capprox': + bread_matrix = approx_differentiation(xk=theta, + f=estimating_equation, + method='capprox', + epsilon=dx) + elif deriv_method.lower() == 'fapprox': + bread_matrix = approx_differentiation(xk=theta, + f=estimating_equation, + method='fapprox', + epsilon=dx) + elif deriv_method.lower() == 'bapprox': + bread_matrix = approx_differentiation(xk=theta, + f=estimating_equation, + method='bapprox', + epsilon=dx) + elif deriv_method.lower() == "exact": # Automatic Differentiation + bread_matrix = auto_differentiation(xk=theta, # Compute the exact derivative at theta + f=estimating_equation) # ... for the given estimating equations + else: + raise ValueError("The input for deriv_method was " + + str(deriv_method) + + ", but only 'approx', 'fapprox', 'capprox', 'bapprox' " + "and 'exact' are available.") + + # Checking for an issue when trying to invert the bread matrix + if np.isnan(bread_matrix).any(): + warnings.warn("The bread matrix contains at least one np.nan, so it cannot be inverted. The variance will " + "not be calculated. This may be an issue with the provided estimating equations or the " + "evaluated theta.", + UserWarning) + + # Returning the constructed bread matrix according to SB 2002 + return -1 * bread_matrix + + +def compute_meat(stacked_equations, theta): + """ + + Parameters + ---------- + stacked_equations + theta + + Returns + ------- + + """ + evald_theta = np.asarray(stacked_equations(theta=theta)) # Evaluating EE at theta-hat + return np.dot(evald_theta, evald_theta.T) + + +def build_sandwich(bread, meat, allow_pinv): + """ + + Parameters + ---------- + bread + meat + allow_pinv + + Returns + ------- + + """ + if np.any(np.isnan(bread)): + return np.nan + + if allow_pinv: # Support 1D theta-hat + bread_invert = np.linalg.pinv(bread) # ... find pseudo-inverse + else: # Support 1D theta-hat + bread_invert = np.linalg.inv(bread) # ... 
find inverse + sandwich = np.dot(np.dot(bread_invert, meat), bread_invert.T) # Compute sandwich + return sandwich From 1f80289ff6d11a6c22b1af7ac6adf865791d6636 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Fri, 5 Apr 2024 12:01:09 -0400 Subject: [PATCH 04/18] updating bread error --- tests/test_MEstimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_MEstimation.py b/tests/test_MEstimation.py index 3ad083c..d6e24a5 100644 --- a/tests/test_MEstimation.py +++ b/tests/test_MEstimation.py @@ -157,7 +157,7 @@ def psi(theta): # Ensuring variance is None but point estimates still exist assert mestr.theta is not None - assert mestr.variance is None + assert np.isnan(mestr.variance) def test_mean_variance_1eq(self): """Tests the mean / variance with a single estimating equation. From 56e29085a4cbb971e9c625418298e85dec1dfaa1 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 08:59:33 -0400 Subject: [PATCH 05/18] new sandwich comp docs --- delicatessen/sandwich.py | 202 ++++++++++++++++++++++++++++++--------- 1 file changed, 159 insertions(+), 43 deletions(-) diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py index cf99de1..cc0670e 100644 --- a/delicatessen/sandwich.py +++ b/delicatessen/sandwich.py @@ -1,3 +1,10 @@ +##################################################################################################################### +# Functionality to compute the sandwich +# This script allows for computation of the empirical sandwich variance estimator with just the +# parameter values and estimating equations. This is to allow computing the sandwich quickly without +# called the MEstimator procedure itself. +##################################################################################################################### + import warnings import numpy as np @@ -7,7 +14,28 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, allow_pinv=True): - """Compute the empirical sandwich variance estimator given [...] and parameter estimates. + """Compute the empirical sandwich variance estimator given a set of estimating equations and parameter estimates. + Note that this functionality does not solve for the parameter estimates (unlike ``MEstimator``). Instead, it + only computes the sandwich for the provided value. + + The empirical sandwich variance estimator is defined as + + .. math:: + + V_n(O_i; \theta) = B_n(O_i; \theta)^{-1} F_n(O_i; \theta) \left[ B_n(O_i; \theta)^{-1} \right]^{T} + + where :math:`B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial \psi(O_i; \theta)}{\partial \theta}`, + :math:`F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T`, and :math:`\psi(O_i; \theta)` is the + estimating function. + + To compute the bread matrix, :math:`B_n`, the matrix of partial derivatives is computed by using either finite + difference methods or automatic differentiation. For finite differences, the default is to use SciPy's + ``approx_fprime`` functionality, which uses forward finite differences. However, you can also use homebrew version + that allows for forward, backward, and center differences. Automatic differentiation is also supported by a + homebrew version. + + To compute the meat matrix, :math:`F_n`, only linear algebra methods, implemented through NumPy, are necessary. + The sandwich is then constructed from these individual pieces. 
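For intuition on how these pieces fit together, the sandwich can also be assembled by hand for the simplest case, the sample mean, where :math:`\psi(Y_i; \theta) = Y_i - \theta`. The bread is then 1 and the meat is the mean of the squared residuals, so the sandwich reduces to the population variance of :math:`Y`. A minimal sketch with hypothetical data, mirroring the unit tests added later in this series

>>> import numpy as np
>>> from delicatessen import compute_sandwich

>>> y = np.array([5., 1., 2., 4., 2., 4., 5., 7., 11., 1., 6., 3., 4., 6.])

>>> def psi(theta):
>>>     return y - theta              # estimating function for the mean

>>> theta_hat = np.mean(y)            # root of the estimating equation
>>> by_hand = np.mean((y - theta_hat)**2)
>>> sandwich = compute_sandwich(stacked_equations=psi, theta=[theta_hat, ])
>>> np.allclose(sandwich, by_hand)
True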
Parameters ---------- @@ -19,12 +47,13 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a that you have solved for the ``theta`` that correspond to the root of the input estimating equations. deriv_method : str, optional Method to compute the derivative of the estimating equations for the bread matrix. Options include numerical - approximation via the central difference method (``'approx'``) and forward-mode automatic differentiation - (``'exact'``). Default is ``'approx'``. + approximation via the forward difference method via SciPy (``'approx'``), forward difference implemented by-hand + (`'fapprox'`), backward difference implemented by-hand (`'bapprox'`), central difference implemented by-hand + (`'capprox'`), or forward-mode automatic differentiation (``'exact'``). Default is ``'approx'``. dx : float, optional Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used when ``deriv_method='approx'``. Default is 1e-9. + used when numerical approximation methods. Default is 1e-9. allow_pinv : bool, optional Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this @@ -32,49 +61,110 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a Returns ------- + array : + Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + + Examples + -------- + Loading necessary functions and building a generic data set for estimation of the mean + + >>> import numpy as np + >>> from delicatessen import MEstimator + >>> from delicatessen import compute_sandwich + >>> from delicatessen.estimating_equations import ee_mean_variance + + >>> y_dat = [1, 2, 4, 1, 2, 3, 1, 5, 2] + + The following is an illustration of how to compute sandwich covariance using only an estimating equation and the + paramter values. The mean and variance (that correspond to ``ee_mean_variance``) can be computed using NumPy by + + >>> mean = np.mean(y_dat) + >>> var = np.var(y_dat, ddof=0) + + For the corresponding estimating equation, we can use the built-in functionality as done below + + >>> def psi(theta): + >>> return ee_mean_variance(theta=theta, y=y_dat) + Calling the sandwich computation procedure + + >>> sandwich_asymp = compute_sandwich(stacked_equations=psi, theta=[mean, var]) + + The output sandwich is the *asymptotic* variance (or the variance that corresponds to the standard deviation). To + get the variance (or the variance that corresponds to the standard error), we rescale ``sandwich`` by the number of + observations + + >>> sandwich = sandwich_asymp / len(y_dat) + + References + ---------- + Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference + (pp. 297-337). Springer, New York, NY. + + Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. The American Statistician, 56(1), 29-38. """ # Evaluating at provided theta values evald_theta = np.asarray(stacked_equations(theta=theta)) # Evaluating EE at theta-hat - if len(theta) == 1: # - n_obs = evald_theta.shape[0] # Number of observations - else: # - n_obs = evald_theta.shape[1] # Number of observations + if len(theta) == 1: # Number of parameters + n_obs = evald_theta.shape[0] # ... 
to get number of obs + else: # Number of parameters + n_obs = evald_theta.shape[1] # ... to get number of obs # Step 1: Compute the bread matrix - bread = compute_bread(stacked_equations=stacked_equations, # - theta=theta, # Provide theta-hat - deriv_method=deriv_method, # Method to use - dx=dx) # - bread = bread / n_obs # + bread = compute_bread(stacked_equations=stacked_equations, # Call the bread matrix function + theta=theta, # ... at given theta-hat + deriv_method=deriv_method, # ... with derivative method + dx=dx) # ... and approximation + bread = bread / n_obs # Scale bread by number of obs # Step 2: Compute the meat matrix - meat = compute_meat(stacked_equations=stacked_equations, # - theta=theta) # - meat = meat / n_obs # Meat is dot product of arrays + meat = compute_meat(stacked_equations=stacked_equations, # Call the meat matrix function + theta=theta) # ... at given theta-hat + meat = meat / n_obs # Scale meat by number of obs # Step 3: Construct sandwich from the bread and meat matrices - sandwich = build_sandwich(bread=bread, # - meat=meat, # - allow_pinv=allow_pinv) # + sandwich = build_sandwich(bread=bread, # Call the sandwich constructor + meat=meat, # ... with bread and meat matrices above + allow_pinv=allow_pinv) # ... and whether to allow pinv # Return the constructed empirical sandwich variance estimator return sandwich def compute_bread(stacked_equations, theta, deriv_method, dx=1e-9): - """ + """Function to compute the bread matrix. The bread matrix is defined as + + .. math:: + + B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial \psi(O_i; \theta)}{\partial \theta} + + The matrix of partial derivatives is computed by using either finite difference methods or automatic + differentiation. For finite differences, the default is to use SciPy's ``approx_fprime`` functionality, which uses + forward finite differences. However, you can also use homebrew version that allows for forward, backward, and + center differences. Automatic differentiation is also supported by a homebrew version. Parameters ---------- - stacked_equations - theta - deriv_method - dx + stacked_equations : function, callable + Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + documentation for how to construct a set of estimating equations. + theta : list, set, array + Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes + that you have solved for the ``theta`` that correspond to the root of the input estimating equations. + deriv_method : str, optional + Method to compute the derivative of the estimating equations for the bread matrix. Options include numerical + approximation via the forward difference method via SciPy (``'approx'``), forward difference implemented by-hand + (`'fapprox'`), backward difference implemented by-hand (`'bapprox'`), central difference implemented by-hand + (`'capprox'`), or forward-mode automatic differentiation (``'exact'``). Default is ``'approx'``. + dx : float, optional + Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value + for ``dx`` should be used, since some large values can result in poor approximations. This argument is only + used when numerical approximation methods. Default is 1e-9. 
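For example, for the mean of ``y`` the summed estimating function is :math:`\sum_i (Y_i - \theta)`, whose derivative with respect to :math:`\theta` is :math:`-n`, so the returned bread (before any scaling by :math:`n`) is the 1-by-1 array ``[[n]]``. A small sketch with hypothetical data, mirroring the unit tests added later in this series

>>> import numpy as np
>>> from delicatessen.sandwich import compute_bread

>>> y = np.array([5., 1., 2., 4., 2., 4., 5., 7., 11., 1., 6., 3., 4., 6.])

>>> def psi(theta):
>>>     return y - theta

>>> bread = compute_bread(psi, theta=[np.mean(y), ], deriv_method='exact')
>>> np.allclose(bread, [[len(y)]])
True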
Returns ------- - + array : + Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` """ def estimating_equation(input_theta): if len(input_theta) == 1: @@ -125,40 +215,66 @@ def estimating_equation(input_theta): def compute_meat(stacked_equations, theta): - """ + """Function to compute the meat matrix. The meat matrix is defined as + + .. math:: + + F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T + + Rather than summing over all the individual contributions, this implementation takes a single dot product of the + stacked estimating functions. This implementation is much faster than summing over :math:`n` matrices. Parameters ---------- - stacked_equations - theta + stacked_equations : function, callable + Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + documentation for how to construct a set of estimating equations. + theta : list, set, array + Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes + that you have solved for the ``theta`` that correspond to the root of the input estimating equations. Returns ------- - + array : + Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` """ evald_theta = np.asarray(stacked_equations(theta=theta)) # Evaluating EE at theta-hat - return np.dot(evald_theta, evald_theta.T) + return np.dot(evald_theta, evald_theta.T) # Return the fast dot product calculation def build_sandwich(bread, meat, allow_pinv): - """ + """Function to combine the sandwich elements together. This function takes the bread and meat matrices, does the + inversions, and then combines them together. This function is separate from ``compute_sandwich`` as it is called + by both ``compute_sandwich`` and ``MEstimator``. Parameters ---------- - bread - meat - allow_pinv + bread : ndarray + The bread matrix. The expected input is the output from the ``compute_bread`` function + meat : ndarray + The meat matrix. The expected input is the output from the ``compute_meat`` function + allow_pinv : bool, optional + Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be + non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this + argument to ``False``. Default is ``True``, which is more robust to the possible bread matrices. Returns ------- - + array : + Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` """ - if np.any(np.isnan(bread)): - return np.nan - - if allow_pinv: # Support 1D theta-hat - bread_invert = np.linalg.pinv(bread) # ... find pseudo-inverse - else: # Support 1D theta-hat - bread_invert = np.linalg.inv(bread) # ... find inverse - sandwich = np.dot(np.dot(bread_invert, meat), bread_invert.T) # Compute sandwich + # Check if there is an issue with the bread matrix + if np.any(np.isnan(bread)): # If bread contains NaN, breaks + return np.nan # ... so give back a NaN + + # Compute the bread inversion + if allow_pinv: # Allowing the pseudo-inverse + bread_invert = np.linalg.pinv(bread) # ... then call pinv + else: # Only allowing the actual inverse + bread_invert = np.linalg.inv(bread) # ... 
then call inv + + # Compute the sandwich variance + sandwich = np.dot(np.dot(bread_invert, meat), bread_invert.T) + + # Return the sandwich covariance matrix return sandwich From 29e3f7452c900c10e57a88105bd032f818f89232 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 09:09:48 -0400 Subject: [PATCH 06/18] Update M-estimator docs --- delicatessen/mestimation.py | 42 +++++++++++++++++++++++-------------- delicatessen/sandwich.py | 2 +- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/delicatessen/mestimation.py b/delicatessen/mestimation.py index a5974e2..c42566c 100644 --- a/delicatessen/mestimation.py +++ b/delicatessen/mestimation.py @@ -1,3 +1,7 @@ +##################################################################################################################### +# Implementation of the M-estimator +##################################################################################################################### + import numpy as np from scipy.optimize import newton, root from scipy.stats import norm @@ -8,9 +12,9 @@ class MEstimator: r"""M-Estimator for stacked estimating equations. - M-Estimation, or loosely referred to as estimating equations, is a general approach to point and variance - estimation that consists of defining an estimator as the solution to an estimating equation. M-estimators - satisify the following + Estimating equations are a general approach to point and variance estimation that consists of defining an estimator + as the solution to a vector of equations that are equal to zero. The corresponding estimators, often called + M-estimators or Z-estimators, satisify the following equation .. math:: @@ -22,12 +26,12 @@ class MEstimator: Note ---- - M-Estimation is advantageous in both theoretical and applied research. M-estimation simplifies proofs of + Estimating equations are advantageous in both theoretical and applied research. They simplifies proofs of consistency and asymptotic normality of estimators under a large-sample approximation framework. In application, - M-estimators simplify estimation of the variance of parameters and automate the delta-method. + this approach to esitmation simplifies estimation of the variance of parameters and automates the delta-method. - M-Estimation consists of two broad step: point estimation and variance estimation. Point estimation is carried out + M-Estimators consists of two broad step: point estimation and variance estimation. Point estimation is carried out by determining the values of :math:`\theta` where the sum of the estimating equations are zero. For variance estimation, the asymptotic sandwich variance estimator is used, which consists of @@ -39,13 +43,13 @@ class MEstimator: .. math:: - B_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} - \psi'(O_i, \hat{\theta}) + B_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} - \frac{\partial}{\partial \theta} \psi(O_i, \hat{\theta}) .. math:: F_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} \psi(O_i, \hat{\theta}) \psi(O_i, \hat{\theta})^T - The partial derivatives for the bread are calculated using either numerical approximation (i.e., central difference + The partial derivatives for the bread are calculated using either numerical approximation (e.g., forward difference method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's ``linalg.pinv``. For the filling, the dot product is taken at :math:`\hat{\theta}`. 
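As a concrete one-parameter illustration of these two pieces, consider the sample mean, where :math:`\psi(O_i, \theta) = Y_i - \theta`. Then

    .. math::

        B_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} 1 = 1, \quad
        F_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} (Y_i - \hat{\theta})^2

so the sandwich reduces to the population variance of :math:`Y`, and dividing by :math:`n` recovers the usual variance of the sample mean.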
@@ -113,6 +117,9 @@ class MEstimator: >>> np.sqrt(np.diag(estr.asymptotic_variance)) # Standard deviation >>> np.sqrt(np.diag(estr.variance)) # Standard error >>> estr.confidence_intervals() # Confidence intervals + >>> estr.z_scores() # Z-scores + >>> estr.p_values() # P-values + >>> estr.s_values() # S-values Alternatively, a custom estimating equation can be specified. This is done by constructing a valid estimating equation for the ``MEstimator``. The ``MEstimator`` expects the ``psi`` function to return a b-by-n array, where b @@ -150,6 +157,8 @@ class MEstimator: >>> estr = MEstimator(stacked_equations=psi, init=[0, 0, ]) >>> estr.estimate(solver=custom_solver) + For more examples on how to apply ``MEstimator``, see https://deli.readthedocs.io/en/latest/ + References ---------- Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference @@ -196,12 +205,14 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr ``tol`` parameter. This argument is not used for user-specified solvers. Default is 1e-9. deriv_method : str, optional Method to compute the derivative of the estimating equations for the bread matrix. Options include numerical - approximation via the central difference method (``'approx'``) and forward-mode automatic differentiation - (``'exact'``). Default is ``'approx'``. + approximation via the forward difference method via SciPy (``'approx'``), forward difference implemented + by-hand (`'fapprox'`), backward difference implemented by-hand (`'bapprox'`), central difference + implemented by-hand (`'capprox'`), or forward-mode automatic differentiation (``'exact'``). + Default is ``'approx'``. dx : float, optional Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used when ``deriv_method='approx'``. Default is 1e-9. + used with numerical approximation methods. Default is 1e-9. allow_pinv : bool, optional Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this @@ -261,10 +272,9 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr # the init values then passing them along to root(). Behind the scenes, self._mestimation_answer_() expands # the parameters (to include everything), calculates the estimating equation at those values, and then # extracts the corresponding subset. - # This process only takes place within Step 1 (the sandwich variance did not require any corresponding - # updates). There is an inherent danger with this process in that if non-subset parameters are not pre-washed, - # then the returned parameters will not be correct. I am considering adding a warning for self_subset_, but I - # may just have to trust the user... + # This process only takes place within Step 1. There is an inherent danger with this process in that if + # non-subset parameters are not pre-washed, then the returned parameters will not be correct. I am + # considering adding a warning for self_subset_, but I currently just trust the user... 
# Processing initial values based on whether subset option was specified if self._subset_ is None: # If NOT subset, @@ -291,7 +301,7 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr # STEP 2: calculating the sandwich variance # After solving for the parameters, we now can compute the empirical sandwich variance estimator. This is # done by compute the bread and meat matrices and then combining them. This is now done by a separate - # functionality within `sandwich.py`. + # functionalities within the `sandwich.py` file as of v2.2. # STEP 2.1: baking the Bread self.bread = compute_bread(stacked_equations=self.stacked_equations, theta=self.theta, diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py index cc0670e..8d4b2e5 100644 --- a/delicatessen/sandwich.py +++ b/delicatessen/sandwich.py @@ -53,7 +53,7 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a dx : float, optional Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used when numerical approximation methods. Default is 1e-9. + used with numerical approximation methods. Default is 1e-9. allow_pinv : bool, optional Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this From faeb843462cb837ca4836b66d5a1309b2cac3b59 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 09:17:57 -0400 Subject: [PATCH 07/18] approx homebrew docs update --- delicatessen/derivative.py | 58 ++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/delicatessen/derivative.py b/delicatessen/derivative.py index 6fe5b12..349a737 100644 --- a/delicatessen/derivative.py +++ b/delicatessen/derivative.py @@ -87,43 +87,45 @@ def approx_differentiation(xk, f, epsilon=1e-9, method='capprox'): >>> def f(x): >>> return [x[0]**2 - x[1], np.sin(np.sqrt(x[1]) + x[2]) + x[2]*(x[1]**2)] - >>> dy = approx_differentiation(xk=[0.7, 1.2, -0.9], f=f) + >>> approx_differentiation(xk=[0.7, 1.2, -0.9], f=f, method='fapprox') + >>> approx_differentiation(xk=[0.7, 1.2, -0.9], f=f, method='bapprox') + >>> approx_differentiation(xk=[0.7, 1.2, -0.9], f=f, method='capprox') which will return a 2-by-3 array of all the x-y pair derivatives at the given values. Here, the rows correspond to - the output and the columns correspond to the inputs. + the output and the columns correspond to the inputs. The approximation methods are forward, backward, and central. """ # Setup parameters for call - xk = np.asarray(xk) - xp = xk.shape[0] - shift = np.identity(n=xk.shape[0]) * epsilon + xk = np.asarray(xk) # Convert inputs into NumPy array if not already + xp = xk.shape[0] # Get the number of parameters in the input + shift = np.identity(n=xk.shape[0]) * epsilon # Define the shift matrix for the partials def generate_matrix(x_shift, f): """Internal function to generate a matrix of the outputs under the parameter shifts, defined by x_shift""" - shift_matrix = [] - for j in range(xp): - shift_matrix.append(f(x_shift[j, :])) - return np.asarray(shift_matrix) + shift_matrix = [] # Storage for matrices + for j in range(xp): # Looping over shift combinations + shift_matrix.append(f(x_shift[j, :])) # ... 
compute output at shifted values + return np.asarray(shift_matrix) # Return matrix under all shifts # Computing the gradient using the corresponding method - if method == 'capprox': - lower = (xk - shift) - upper = (xk + shift) - f0 = generate_matrix(x_shift=lower, f=f) - f1 = generate_matrix(x_shift=upper, f=f) - deriv = (f1 - f0).T / (2*epsilon) - elif method == 'fapprox': - lower = (xk - shift) - f0 = generate_matrix(x_shift=lower, f=f) - f_eval = f(xk) - f1 = np.asarray([f_eval for i in range(xp)]) - deriv = (f1 - f0).T / epsilon - elif method == 'bapprox': - f_eval = f(xk) - f0 = np.asarray([f_eval for i in range(xp)]) - upper = (xk + shift) - f1 = generate_matrix(x_shift=upper, f=f) - deriv = (f1 - f0).T / epsilon - else: + if method == 'capprox': # Central difference + lower = (xk - shift) # ... defining lower shift + f0 = generate_matrix(x_shift=lower, f=f) # ... output for lower shift + upper = (xk + shift) # ... defining upper shift + f1 = generate_matrix(x_shift=upper, f=f) # ... output for upper shift + deriv = (f1 - f0).T / (2*epsilon) # ... central difference approximation + elif method == 'bapprox': # Backward difference + lower = (xk - shift) # ... defining lower shift + f0 = generate_matrix(x_shift=lower, f=f) # ... output for lower shift + f_eval = f(xk) # ... upper is held fixed + f1 = np.asarray([f_eval for i in range(xp)]) # ... stack upper into a matrix + deriv = (f1 - f0).T / epsilon # ... backward difference approximation + elif method == 'fapprox': # Forward difference + f_eval = f(xk) # ... lower is held fixed + f0 = np.asarray([f_eval for i in range(xp)]) # ... stack lower into a matrix + upper = (xk + shift) # ... defining upper shift + f1 = generate_matrix(x_shift=upper, f=f) # ... output for upper shift + deriv = (f1 - f0).T / epsilon # ... 
forward difference approximation + else: # Otherwise error raise ValueError("Method chosen is not supported") # Processing the final return based on parameter shape From 6bda6aa0dc099b4df538fd5478194467ba5165eb Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 09:34:01 -0400 Subject: [PATCH 08/18] getting approx-deriv to play nicely --- delicatessen/derivative.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/delicatessen/derivative.py b/delicatessen/derivative.py index 349a737..55621fc 100644 --- a/delicatessen/derivative.py +++ b/delicatessen/derivative.py @@ -129,7 +129,10 @@ def generate_matrix(x_shift, f): raise ValueError("Method chosen is not supported") # Processing the final return based on parameter shape - return deriv + if xp == 1: + return np.asarray([deriv, ]) + else: + return deriv def auto_differentiation(xk, f): From 98abc3bccad70071ee4adfce0517874609288211 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 09:34:21 -0400 Subject: [PATCH 09/18] updating differentiation tests to include approx --- ...st_autodiff.py => test_differentiation.py} | 286 +++++++++++++++++- 1 file changed, 284 insertions(+), 2 deletions(-) rename tests/{test_autodiff.py => test_differentiation.py} (84%) diff --git a/tests/test_autodiff.py b/tests/test_differentiation.py similarity index 84% rename from tests/test_autodiff.py rename to tests/test_differentiation.py index 24203c8..59379c9 100644 --- a/tests/test_autodiff.py +++ b/tests/test_differentiation.py @@ -1,5 +1,5 @@ #################################################################################################################### -# Tests for automatic differentiation procedures +# Tests for differentiation procedures #################################################################################################################### import pytest @@ -10,7 +10,7 @@ from scipy.stats import logistic from scipy.optimize import approx_fprime -from delicatessen.derivative import auto_differentiation +from delicatessen.derivative import auto_differentiation, approx_differentiation from delicatessen.utilities import inverse_logit, identity, polygamma, standard_normal_cdf, standard_normal_pdf from delicatessen import MEstimator from delicatessen.data import load_inderjit @@ -468,6 +468,168 @@ def f(x): npt.assert_allclose(dx_true, dx_exact, atol=1e-5) +class TestApproxDifferentiation: + + def test_single_evaluation(self): + def f(x): + return -32 + 4*x - 10*x**2 + + # Points to Evaluate at + xinput = [2.754, ] + + # Evaluating the derivatives at the points + dx_byhand = approx_differentiation(xinput, f, method='fapprox') + dx_approx = approx_fprime(xinput, f, epsilon=1e-9) + + # Checking + npt.assert_allclose(dx_approx, dx_byhand[0][0], atol=1e-5) + + def test_doc_example(self): + # First function to check + def f(x): + return x ** 2 - x + np.sin(x + np.sqrt(x)) + + dy_approx = approx_differentiation(xk=[1, ], f=f) + dy_exact = auto_differentiation(xk=[1, ], f=f) + npt.assert_allclose(dy_exact, dy_approx[0][0], atol=1e-5) + + # Second function to check + def f(x): + return [x[0]**2 - x[1], np.sin(np.sqrt(x[1]) + x[2]) + x[2]*(x[1]**2)] + + dy_approx = approx_differentiation(xk=[0.7, 1.2, -0.9], f=f) + dy_exact = auto_differentiation(xk=[0.7, 1.2, -0.9], f=f) + npt.assert_allclose(dy_exact, dy_approx, atol=1e-5) + + def test_compare_elementary_operators_f(self): + # Defining the functions to check + def f(x): + 
return [10, + 10 - 5 + 32*5 - 6**2, + 5 + x[0] + x[1] - x[2] - x[3], + -32 + x[0]*x[2] + x[1]*x[3], + x[1]**2 + x[0] + x[3] - 30, + -32 + x[0]**x[2] + x[1]**x[3], + (x[0] + x[1])**(x[2] + x[3]) + 6, + 5*x[1]**2 + (x[2]**2)*5, + x[1] / 10 + (10/x[2])**2, + x[0]**x[1], + 0.9**x[2], + (x[3] + 0.9)**(x[1] * x[0] - 0.1), + ] + + # Points to Evaluate at + xinput = [0.5, 1.9, -2.3, 2] + + # Evaluating the derivatives at the points + dx_byhand = approx_differentiation(xinput, f, epsilon=1e-9, method='fapprox') + dx_approx = approx_fprime(xinput, f, epsilon=1e-9) + + # Checking + npt.assert_allclose(dx_approx, dx_byhand, atol=1e-5) + + def test_compare_elementary_operators_b(self): + # Defining the functions to check + def f(x): + return [10, + 10 - 5 + 32*5 - 6**2, + 5 + x[0] + x[1] - x[2] - x[3], + -32 + x[0]*x[2] + x[1]*x[3], + x[1]**2 + x[0] + x[3] - 30, + -32 + x[0]**x[2] + x[1]**x[3], + (x[0] + x[1])**(x[2] + x[3]) + 6, + 5*x[1]**2 + (x[2]**2)*5, + x[1] / 10 + (10/x[2])**2, + x[0]**x[1], + 0.9**x[2], + (x[3] + 0.9)**(x[1] * x[0] - 0.1), + ] + + # Points to Evaluate at + xinput = [0.5, 1.9, -2.3, 2] + + # Evaluating the derivatives at the points + dx_byhand = approx_differentiation(xinput, f, epsilon=1e-9, method='bapprox') + dx_approx = approx_fprime(xinput, f, epsilon=1e-9) + + # Checking + npt.assert_allclose(dx_approx, dx_byhand, atol=1e-5) + + def test_compare_elementary_operators_c(self): + # Defining the functions to check + def f(x): + return [10, + 10 - 5 + 32*5 - 6**2, + 5 + x[0] + x[1] - x[2] - x[3], + -32 + x[0]*x[2] + x[1]*x[3], + x[1]**2 + x[0] + x[3] - 30, + -32 + x[0]**x[2] + x[1]**x[3], + (x[0] + x[1])**(x[2] + x[3]) + 6, + 5*x[1]**2 + (x[2]**2)*5, + x[1] / 10 + (10/x[2])**2, + x[0]**x[1], + 0.9**x[2], + (x[3] + 0.9)**(x[1] * x[0] - 0.1), + ] + + # Points to Evaluate at + xinput = [0.5, 1.9, -2.3, 2] + + # Evaluating the derivatives at the points + dx_byhand = approx_differentiation(xinput, f, epsilon=1e-9, method='capprox') + dx_approx = approx_fprime(xinput, f, epsilon=1e-9) + + # Checking + npt.assert_allclose(dx_approx, dx_byhand, atol=1e-5) + + def test_compare_equality1_operators(self): + # Defining the functions to check + def f(x): + return [(x[0] <= 0.1)*x[0] + x[1]**x[2], + (x[0] < 0.1)*x[0] + x[1]**x[2], + (x[0] >= 0.1) * x[0] + x[1] ** x[2], + (x[0] > 0.1) * x[0] + x[1] ** x[2], + (x[0] >= 5.0)*x[0] + x[1]**x[2], + (x[0] > 5.0)*x[0] + x[1]**x[2], + (x[0] <= 5.0)*x[0] + x[1]**x[2], + (x[0] < 5.0)*x[0] + x[1]**x[2], + (x[0] <= 5.1)*(x[0] <= 7.0)*(x[0] ** 2.5)*(x[0] + 3)**0.5 + 27*x[0]**3, + (x[0] < 5.1) * (x[0] + x[1] ** 2) ** 3, + ] + + # Points to Evaluate at + xinput = [0.5, 1.9, -2.3, 2] + + # Evaluating the derivatives at the points + dx_byhand = approx_differentiation(xinput, f, method='fapprox') + dx_approx = approx_fprime(xinput, f, epsilon=1e-9) + + # Checking + npt.assert_allclose(dx_approx, dx_byhand, atol=1e-5) + + def test_scipy_special_numapprox(self): + def f(x): + return [polygamma(n=1, x=x[0]), + polygamma(n=2, x=x[1]) + x[1]**2, + polygamma(n=3, x=x[2]*x[3] + x[1]), + polygamma(n=4, x=np.log(x[3] + x[1]) + x[0]**2) - x[3], + standard_normal_cdf(x=x[1]), + standard_normal_pdf(x=x[2]), + ] + + # Points to Evaluate at + xinput = [0.5, 1.9, -2.3, 2] + + # Approximate Derivatives + dx_approx = approx_fprime(xinput, f, epsilon=1e-9) + + # Evaluating the derivatives at the points + dx_byhand = approx_differentiation(xinput, f, method='fapprox') + + # Checking + npt.assert_allclose(dx_approx, dx_byhand, atol=1e-5) + + class TestSandwichAutoDiff: # Basics @@ 
-1532,3 +1694,123 @@ def psi(theta): npt.assert_allclose(var_approx, var_exact, atol=1e-6) + + +class TestSandwichApproxDiff: + + def test_capprox_bread_mean(self): + # Data set + y = np.array([5, 1, 2, 4, 2, 4, 5, 7, 11, 1, 6, 3, 4, 6]) + + def psi(theta): + return y - theta + + mestr = MEstimator(psi, init=[0, ]) + mestr.estimate(deriv_method='capprox') + + # Checking bread estimates + npt.assert_allclose(mestr.bread, + [[1]], + rtol=1e-6) + + # Checking variance estimates + npt.assert_allclose(mestr.asymptotic_variance, + np.var(y, ddof=0), + rtol=1e-6) + + def test_capprox_bread_mean_var(self): + # Data set + y = np.array([5, 1, 2, 4, 2, 4, 5, 7, 11, 1, 6, 3, 4, 6]) + + def psi(theta): + return ee_mean_variance(theta=theta, y=y) + + mestr = MEstimator(psi, init=[0, 1, ]) + mestr.estimate(deriv_method='capprox') + bread_exact = mestr.bread + var_exact = mestr.variance + mestr.estimate(deriv_method='approx') + bread_approx = mestr.bread + var_approx = mestr.variance + + # Checking bread estimates + npt.assert_allclose(bread_approx, bread_exact, atol=1e-6) + + # Checking variance estimates + npt.assert_allclose(var_approx, var_exact, atol=1e-5) + + def test_capprox_bread_glm_lognb(self): + d = pd.DataFrame() + d['X'] = [1, -1, 0, 1, 2, 1, -2, -1, 0, 3, -3, 1, 1, -1, -1, -2, 2, 0, -1, 0] + d['Z'] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + d['Y'] = [0, 0, 0, 0, 0, 15, 15, 25, 25, 45, 0, 0, 0, 0, 15, 15, 15, 25, 25, 35] + d['I'] = 1 + + def psi(theta): + return ee_glm(theta, X=d[['I', 'X']], y=d['Y'], + distribution='nb', link='log') + + # Auto-differentation + mestr = MEstimator(psi, init=[0., 0., 1.]) + mestr.estimate(solver='lm', deriv_method='capprox') + bread_exact = mestr.bread + var_exact = mestr.variance + + # Central difference method + mestr = MEstimator(psi, init=[0., 0., 1.]) + mestr.estimate(solver='lm', deriv_method='approx') + bread_approx = mestr.bread + var_approx = mestr.variance + + # Checking bread estimates + npt.assert_allclose(bread_approx, bread_exact, atol=1e-6) + + # Checking variance estimates + npt.assert_allclose(var_approx, var_exact, atol=1e-5) + + def test_capprox_aipw(self): + d = pd.DataFrame() + d['W'] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3] + d['V'] = [1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0] + d['A'] = [1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1] + d['Y'] = [3, 5, 1, 5, 2, 5, 2, 1, 4, 2, 3, 4, 2, 5, 5, + 3, 5, 1, 5, 2, 5, 2, 1, 4, 2, 3, 4, 2, 5, 5, + 3, 5, 1, 5, 2, 5, 2, 1, 4, 2, 3, 4, 2, 5, 5] + d['I'] = 1 + d['A1'] = 1 + d['A0'] = 0 + + def psi(theta): + return ee_aipw(theta, y=d['Y'], A=d['A'], + W=d[['I', 'W']], + X=d[['I', 'A', 'W']], + X1=d[['I', 'A1', 'W']], + X0=d[['I', 'A0', 'W']]) + + mestr = MEstimator(psi, init=[0., ]*8) + + # Auto-differentation + mestr.estimate(solver='lm', deriv_method='capprox') + bread_exact = mestr.bread + var_exact = mestr.variance + + # Central difference method + mestr.estimate(solver='lm', deriv_method='approx') + bread_approx = mestr.bread + var_approx = mestr.variance + + # Checking bread estimates + npt.assert_allclose(bread_approx, + bread_exact, + atol=1e-6) + + # Checking variance estimates + npt.assert_allclose(var_approx, + var_exact, + atol=1e-5) From 703611c6037495a8b04225255aa2dfef32055871 Mon 
Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 10:13:24 -0400 Subject: [PATCH 10/18] adding tests for new sandwich --- delicatessen/sandwich.py | 2 +- tests/test_sandwich.py | 222 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 tests/test_sandwich.py diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py index 8d4b2e5..a0ee89e 100644 --- a/delicatessen/sandwich.py +++ b/delicatessen/sandwich.py @@ -242,7 +242,7 @@ def compute_meat(stacked_equations, theta): return np.dot(evald_theta, evald_theta.T) # Return the fast dot product calculation -def build_sandwich(bread, meat, allow_pinv): +def build_sandwich(bread, meat, allow_pinv=True): """Function to combine the sandwich elements together. This function takes the bread and meat matrices, does the inversions, and then combines them together. This function is separate from ``compute_sandwich`` as it is called by both ``compute_sandwich`` and ``MEstimator``. diff --git a/tests/test_sandwich.py b/tests/test_sandwich.py new file mode 100644 index 0000000..ef2a88f --- /dev/null +++ b/tests/test_sandwich.py @@ -0,0 +1,222 @@ +#################################################################################################################### +# Tests for sandwich computations +#################################################################################################################### + +import pytest +import numpy as np +import numpy.testing as npt +import pandas as pd +from delicatessen import compute_sandwich, MEstimator +from delicatessen.estimating_equations import ee_mean_variance, ee_glm +from delicatessen.sandwich import compute_bread, compute_meat, build_sandwich + + +@pytest.fixture +def y(): + return np.array([5, 1, 2, 4, 2, 4, 5, 7, 11, 1, 6, 3, 4, 6]) + + +@pytest.fixture +def x(): + return np.array([2, -1, 2, 6, 2, 4, -5, 7, -5, 1, 3, 1, 1, 0]) + + +class TestBread: + + def test_error_method(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + with pytest.raises(ValueError, match="input for deriv_method"): + compute_bread(psi, theta=[mean, ], deriv_method='wrong') + + def test_error_nan(self, y): + def psi(theta): + return y - theta + + with pytest.warns(UserWarning, match="contains at least one np.nan"): + compute_bread(psi, theta=[np.nan, ], deriv_method='approx') + + def test_approx(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + bread = compute_bread(psi, theta=[mean, ], deriv_method='approx') + npt.assert_allclose([[1*len(y), ], ], bread, atol=1e-7) + + def test_fapprox(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + bread = compute_bread(psi, theta=[mean, ], deriv_method='fapprox') + npt.assert_allclose([[1*len(y), ], ], bread, atol=1e-7) + + def test_bapprox(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + bread = compute_bread(psi, theta=[mean, ], deriv_method='bapprox') + npt.assert_allclose([[1*len(y), ], ], bread, atol=1e-7) + + def test_capprox(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + bread = compute_bread(psi, theta=[mean, ], deriv_method='capprox') + npt.assert_allclose([[1*len(y), ], ], bread, atol=1e-7) + + def test_exact(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + bread = compute_bread(psi, theta=[mean, ], deriv_method='exact') + npt.assert_allclose([[1*len(y), ], ], bread, atol=1e-7) + + def test_approx_2d(self, y, x): + def psi(theta): + 
return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + bread = compute_bread(psi, theta=[mean_y, mean_x], deriv_method='approx') + npt.assert_allclose([[1*len(y), 0], [0, 1*len(x)]], + bread, atol=1e-7) + + def test_fapprox_2d(self, y, x): + def psi(theta): + return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + bread = compute_bread(psi, theta=[mean_y, mean_x], deriv_method='fapprox') + npt.assert_allclose([[1*len(y), 0], [0, 1*len(x)]], + bread, atol=1e-7) + + def test_bapprox_2d(self, y, x): + def psi(theta): + return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + bread = compute_bread(psi, theta=[mean_y, mean_x], deriv_method='bapprox') + npt.assert_allclose([[1*len(y), 0], [0, 1*len(x)]], + bread, atol=1e-7) + + def test_capprox_2d(self, y, x): + def psi(theta): + return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + bread = compute_bread(psi, theta=[mean_y, mean_x], deriv_method='capprox') + npt.assert_allclose([[1*len(y), 0], [0, 1*len(x)]], + bread, atol=1e-7) + + def test_exact_2d(self, y, x): + def psi(theta): + return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + bread = compute_bread(psi, theta=[mean_y, mean_x], deriv_method='exact') + npt.assert_allclose([[1*len(y), 0], [0, 1*len(x)]], + bread, atol=1e-7) + + +class TestMeat: + + def test_1d(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + meat = compute_meat(psi, theta=[mean, ]) / len(y) + npt.assert_allclose(np.var(y, ddof=0), meat, atol=1e-7) + + def test_2d(self, y, x): + def psi(theta): + return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + meat = compute_meat(psi, theta=[mean_y, mean_x]) / len(y) + npt.assert_allclose(np.cov(y, x, ddof=0), meat, atol=1e-7) + + +class TestBuildSandwich: + + def test_nan(self, y): + def psi(theta): + return y - theta + + mean = np.nan + bread = compute_bread(psi, theta=[mean, ], deriv_method='approx') + meat = compute_meat(psi, theta=[mean, ]) + sandwich = build_sandwich(bread=bread, meat=meat) + assert np.isnan(sandwich) + + def test_solve_1d(self, y): + def psi(theta): + return y - theta + + mean = np.mean(y) + bread = compute_bread(psi, theta=[mean, ], deriv_method='approx') / len(y) + meat = compute_meat(psi, theta=[mean, ]) / len(y) + sandwich = build_sandwich(bread=bread, meat=meat) + npt.assert_allclose(np.var(y, ddof=0), sandwich, atol=1e-7) + + def test_solve_2d(self, y, x): + def psi(theta): + return [y - theta[0], x - theta[1]] + + mean_y = np.mean(y) + mean_x = np.mean(x) + bread = compute_bread(psi, theta=[mean_y, mean_x], deriv_method='approx') / len(y) + meat = compute_meat(psi, theta=[mean_y, mean_x]) / len(y) + sandwich = build_sandwich(bread=bread, meat=meat) + npt.assert_allclose(np.cov(y, x, ddof=0), sandwich, atol=1e-7) + + +class TestComputeSandwich: + + def test_docs_example(self): + def psi(theta): + return ee_mean_variance(theta=theta, y=y_dat) + + y_dat = [1, 2, 4, 1, 2, 3, 1, 5, 2] + mean = np.mean(y_dat) + var = np.var(y_dat, ddof=0) + sandwich = compute_sandwich(stacked_equations=psi, theta=[mean, var]) / len(y_dat) + npt.assert_allclose([[np.var(y_dat, ddof=0) / len(y_dat), 0.20576132], + [0.20576132, 0.48834019]], + sandwich, rtol=1e-6) + + def test_compute_versus_build(self): + d = pd.DataFrame() + d['X'] = [1, -1, 0, 1, 2, 1, -2, -1, 0, 3, -3, 1, 1, -1, -1, -2, 2, 0, -1, 0] + d['Z'] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + d['Y'] = 
[0, 0, 0, 0, 0, 15, 15, 25, 25, 45, 0, 0, 0, 0, 15, 15, 15, 25, 25, 35] + d['I'] = 1 + + # M-estimation negative binomial + def psi(theta): + ef_glm = ee_glm(theta[:-1], X=d[['I', 'X', 'Z']], y=d['Y'], + distribution='nb', link='log') + ef_ta = np.ones(d.shape[0]) * (np.exp(theta[-2]) - theta[-1]) + return np.vstack([ef_glm, ef_ta]) + + mestr = MEstimator(psi, init=[0., 0., 0., -2., 1.]) + mestr.estimate(solver='lm', maxiter=5000) + var_build = mestr.variance + + # Compute sandwich calculations + var_compute = compute_sandwich(psi, theta=mestr.theta, deriv_method='approx') / d.shape[0] + + # Checking variance estimates + npt.assert_allclose(var_build, var_compute, atol=1e-7) From 54f7892e282f8a599d782c64ea85c50e443161c2 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sat, 6 Apr 2024 10:22:48 -0400 Subject: [PATCH 11/18] revise how compute_sandwich handles bad bread --- delicatessen/mestimation.py | 9 ++++++--- delicatessen/sandwich.py | 2 +- tests/test_MEstimation.py | 2 +- tests/test_sandwich.py | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/delicatessen/mestimation.py b/delicatessen/mestimation.py index c42566c..7b87d2c 100644 --- a/delicatessen/mestimation.py +++ b/delicatessen/mestimation.py @@ -316,7 +316,10 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr self.asymptotic_variance = build_sandwich(bread=self.bread, meat=self.meat, allow_pinv=allow_pinv) - self.variance = self.asymptotic_variance / self.n_obs + if self.asymptotic_variance is None: + self.variance = self.asymptotic_variance + else: + self.variance = self.asymptotic_variance / self.n_obs def confidence_intervals(self, alpha=0.05): r"""Calculate Wald-type :math:`(1 - \alpha) \times` 100% confidence intervals using the point estimates and @@ -343,7 +346,7 @@ def confidence_intervals(self, alpha=0.05): intervals for :math:`\theta_b` """ # Check that estimate() has been called - if self.variance is None or np.isnan(self.variance): + if self.variance is None: raise ValueError("Either theta has not been estimated yet, or there is a np.nan in the bread matrix. " "Therefore, confidence_intervals() cannot be called.") # Check valid alpha value is being provided @@ -382,7 +385,7 @@ def z_scores(self, null=0): Array of Z-scores for :math:`\theta_1, ..., \theta_b`, respectively """ # Check that self.estimate() has been called - if self.variance is None or np.isnan(self.variance): + if self.variance is None: raise ValueError("Either theta has not been estimated yet, or there is a np.nan in the bread matrix. " "Therefore, z_scores() cannot be called.") diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py index a0ee89e..5b76377 100644 --- a/delicatessen/sandwich.py +++ b/delicatessen/sandwich.py @@ -265,7 +265,7 @@ def build_sandwich(bread, meat, allow_pinv=True): """ # Check if there is an issue with the bread matrix if np.any(np.isnan(bread)): # If bread contains NaN, breaks - return np.nan # ... so give back a NaN + return None # ... 
so give back a NaN # Compute the bread inversion if allow_pinv: # Allowing the pseudo-inverse diff --git a/tests/test_MEstimation.py b/tests/test_MEstimation.py index d6e24a5..3ad083c 100644 --- a/tests/test_MEstimation.py +++ b/tests/test_MEstimation.py @@ -157,7 +157,7 @@ def psi(theta): # Ensuring variance is None but point estimates still exist assert mestr.theta is not None - assert np.isnan(mestr.variance) + assert mestr.variance is None def test_mean_variance_1eq(self): """Tests the mean / variance with a single estimating equation. diff --git a/tests/test_sandwich.py b/tests/test_sandwich.py index ef2a88f..3b16ecc 100644 --- a/tests/test_sandwich.py +++ b/tests/test_sandwich.py @@ -159,7 +159,7 @@ def psi(theta): bread = compute_bread(psi, theta=[mean, ], deriv_method='approx') meat = compute_meat(psi, theta=[mean, ]) sandwich = build_sandwich(bread=bread, meat=meat) - assert np.isnan(sandwich) + assert sandwich is None def test_solve_1d(self, y): def psi(theta): From b18caa9702e8447a8353da97cdf50d52755f817d Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Sun, 7 Apr 2024 10:52:18 -0400 Subject: [PATCH 12/18] Adding IJE paper to intro section --- docs/index.rst | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 1d05f57..03c77a3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,10 +9,10 @@ recommend looking into ``geex`` (`Saul & Hudgens (2020) Date: Fri, 12 Apr 2024 10:55:51 -0400 Subject: [PATCH 13/18] Updating README --- README.md | 12 ++++++------ docs/index.rst | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 11760d5..6d6c415 100644 --- a/README.md +++ b/README.md @@ -18,10 +18,10 @@ of M-estimation. ## M-Estimation and Estimating Equations Here, we provide a brief overview of M-estimation theory. For more detailed introductions to M-estimation, see Ross -et al. (2024) or Chapter 7 of Boos & Stefanski (2013). M-estimation is a generalization of likelihood-based methods. -*M-estimators* are solutions to estimating equations. To apply the M-estimator, we solve the estimating equations using -observed data. This is similar to other approaches, but the key advantage of M-Estimators is estimation of the variance -via the sandwich variance. +et al. (2024), Stefanski & Boos (2002), or Chapter 7 of Boos & Stefanski (2013). M-estimation is a generalization of +likelihood-based methods. *M-estimators* are solutions to estimating equations. To apply the M-estimator, we solve the +estimating equations using observed data. This is similar to other approaches, but the key advantage of M-Estimators is +variance estimation via the empirical sandwich variance estimator. While M-Estimation is a powerful tool, the derivatives and matrix algebra can quickly become unwieldy. This is where `delicatessen` comes in. `delicatessen` takes estimating functions and data, and solves for the parameter estimates, @@ -84,10 +84,10 @@ at [delicatessen website](https://deli.readthedocs.io/en/latest/). Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference (pp. 297-337). Springer, New York, NY. +Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. *The American Statistician*, 56(1), 29-38. + Ross RK, Zivich PN, Stringer JS, & Cole SR. (2024). M-estimation for common epidemiological measures: introduction and applied examples. *International Journal of Epidemiology*, 53(2). 
-Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. *The American Statistician*, 56(1), 29-38. - Zivich PN, Klose M, Cole SR, Edwards JK, & Shook-Sa BE. (2022). Delicatessen: M-Estimation in Python. *arXiv preprint arXiv:2203.11300*. diff --git a/docs/index.rst b/docs/index.rst index 03c77a3..da55acc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -54,9 +54,9 @@ To install ``delicatessen``, use the following command in terminal or command pr Only two dependencies for ``delicatessen`` are: ``NumPy``, ``SciPy``. -While pandas is not necessary, several examples are demonstrated with pandas for ease of data processing. To replicate -the tests in ``tests/`` you will need to install ``pandas``, ``statsmodels`` and ``pytest`` (but these are not necessary -for use of the package). +While ``pandas`` is not a dependency, several examples are demonstrated with pandas for ease of data processing. To +replicate the tests in ``tests/`` you will need to also install ``pandas``, ``statsmodels`` and ``pytest`` (but these +are not necessary for use of the package). Citation: ------------- From ed05ce72c5c84798db16be7a18d1ed7e34d403be Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:11:19 -0400 Subject: [PATCH 14/18] Adding extended Rogan-Gladen --- .../estimating_equations/measurement.py | 167 +++++++++++++++++- tests/test_ee_measurement.py | 148 +++++++++++++++- 2 files changed, 307 insertions(+), 8 deletions(-) diff --git a/delicatessen/estimating_equations/measurement.py b/delicatessen/estimating_equations/measurement.py index ac44e73..51618da 100644 --- a/delicatessen/estimating_equations/measurement.py +++ b/delicatessen/estimating_equations/measurement.py @@ -21,20 +21,22 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): \mu \times \left\{ \alpha + \beta - 1 \right\} - \left\{ \mu^* + \beta - 1 \right\} \\ R_i (Y_i^* - \mu^*) \\ (1-R_i) Y_i \left\{ Y^*_i - \beta \right\} \\ - (1-R_i) (1-Y_i) \left\{ Y^*_i - \alpha \right\} \\ + (1-R_i) (1-Y_i) \left\{ (1 - Y^*_i) - \alpha \right\} \\ \end{bmatrix} = 0 - where :math:`Y` is the true value of the outcome, :math:`Y^*` is the mismeasured value of the outcome. The first + where :math:`Y` is the true value of the outcome, :math:`Y^*` is the mismeasured value of the outcome, :math:`R` is + the indicator for the main study data, :math:`\mu` is the corrected mean, :math:`\mu^*` is the mismeasured mean in + the main study data, :math:`\beta` is the sensitivity, and :math:`\alpha` is the specificity. The first estimating equation is the corrected proportion, the second is the naive proportion, the third is for sensitivity, and the fourth for specificity. - Here, :math:`\theta` is a 1-by-4 array. + Here, ``theta`` is a 1-by-4 array. Note ---- The Rogan-Gladen estimator may provide corrected proportions outside of :math:`[0,1]` when - :math:`\alpha + \beta \le 1`, or the addition of sensitivity and specificity is less than or equal to one. + :math:`\alpha + \beta \le 1`. Parameters ---------- @@ -67,7 +69,7 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): >>> from delicatessen import MEstimator >>> from delicatessen.estimating_equations import ee_rogan_gladen - Replicating the example from Cole et al. (2023). + Replicating the published example from Cole et al. (2023). 
>>> d = pd.DataFrame() >>> d['Y_star'] = [0, 1] + [0, 1, 0, 1] @@ -99,7 +101,8 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): >>> estr.theta[0] - Inverse probability weights can be used through the ``weights`` argument. + Inverse probability weights can be used through the ``weights`` argument. See the applied examples for a + demonstration. References ---------- @@ -108,6 +111,9 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): Rogan WJ & Gladen B. (1978). Estimating prevalence from the results of a screening test. *American Journal of Epidemiology*, 107(1), 71-76. + + Ross RK, Zivich PN, Stringer JSA, & Cole SR. (2024). M-estimation for common epidemiological measures: introduction + and applied examples. *International Journal of Epidemiology*, 53(2), dyae030. """ # Processing inputs y = np.asarray(y) # Convert to NumPy array @@ -138,3 +144,152 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): ee_naive_mean, # Naive mean ee_sens, # Sensitivity model parameters ee_spec]) # Specificity model parameters + + +def ee_rogan_gladen_extended(theta, y, y_star, r, X, weights=None): + r"""Estimating equation for the extended Rogan-Gladen correction for mismeasured *binary* outcomes. This estimator + uses external data to estimate the sensitivity and specificity conditional on covariates, and then uses those + external estimates to correct the estimated proportion. The general form of the estimating equations are + + .. math:: + + \sum_{i=1}^n + \begin{bmatrix} + R_i \times \left\{ \frac{Y^* + m(X_i; \beta) - 1}{m(X_i; \alpha) + m(X_i; \beta) - 1} - \mu \right\} \\ + (1-R_i) Y_i \left\{ Y^*_i - m(X_i; \beta) \right\} X_i^T \\ + (1-R_i) (1 - Y_i) \left\{ (1 - Y^*_i) - m(X_i; \beta) \right\} X_i^T \\ + \end{bmatrix} + = 0 + + where :math:`Y` is the true value of the outcome, :math:`Y^*` is the mismeasured value of the outcome. The first + estimating equation is the corrected proportion, the second is for sensitivity, and the third for specificity. + + If :math:`X` is of dimension :math:`p`, then ``theta`` is a 1-by-(1+2p) array. Note that the design matrix is + shared across the sensitivity and specificity models. + + Note + ---- + The Rogan-Gladen estimator may provide corrected proportions outside of :math:`[0,1]` when + :math:`\alpha + \beta \le 1`, or the addition of sensitivity and specificity is less than or equal to one. + + Parameters + ---------- + theta : ndarray, list, vector + Theta consists of 4 values. + y : ndarray, list, vector + 1-dimensional vector of n observed values. These are the gold-standard :math:`Y` measurements in the external + sample. All values should be either 0 or 1, and be non-missing among those with :math:`R=0`. + y_star : ndarray, list, vector + 1-dimensional vector of n observed values. These are the mismeasured :math:`Y` values. All values should be + either 0 or 1, and be non-missing among all observations. + r : ndarray, list, vector + 1-dimensional vector of n indicators regarding whether an observation was part of the external validation data. + Indicator should designate if observations are the main data. + X : ndarray, list, vector + 2-dimensional vector of a design matrix for the sensitivity and specificity models. + weights : ndarray, list, vector, None, optional + 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. 
+ + Returns + ------- + array : + Returns a 4-by-n NumPy array evaluated for the input ``theta`` + + Examples + -------- + Construction of a estimating equation(s) with ``ee_rogan_gladen_extended`` should be done similar to the following + + >>> import numpy as np + >>> import pandas as pd + >>> from scipy.stats import logistic + >>> from delicatessen import MEstimator + >>> from delicatessen.estimating_equations import ee_rogan_gladen_extended + + Replicating the example from Cole et al. (2023). + + >>> d = pd.DataFrame() + >>> d['Y_star'] = [0, 1] + [0, 1, 0, 1] + >>> d['Y'] = [np.nan, np.nan] + [0, 0, 1, 1] + >>> d['S'] = [1, 1] + [0, 0, 0, 0] + >>> d['n'] = [270, 680] + [71, 18, 38, 203] + >>> d = pd.DataFrame(np.repeat(d.values, d['n'], axis=0), columns=d.columns) + >>> d['C'] = 1 + + Applying the Rogan-Gladen correction to this example + + >>> def psi(theta): + >>> return ee_rogan_gladen_extended(theta=theta, y=d['Y'], + >>> y_star=d['Y_star'], + >>> X=d[['C', ]], r=d['S']) + + Notice that ``y`` corresponds to the gold-standard outcomes (only available where R=0), ``y_star`` corresponds to + the mismeasured covariate data (available for R=1 and R=0), and ``r`` corresponds to the indicator for the main + data source. Now we can call the M-Estimator. + + >>> estr = MEstimator(psi, init=[0.5, 1., 1.]) + >>> estr.estimate(solver='lm') + + Inspecting the parameter estimates, variance, and 95% confidence intervals + + >>> estr.theta + >>> estr.variance + >>> estr.confidence_intervals() + + Note + ---- + The sensitivity and specificity in ``ee_rogan_gladen_extended`` correspond to the logit transformations, unlike + ``ee_rogan_gladen`` which returns the sensitivity and specificity directly. + + + The corrected proportion is + + >>> estr.theta[0] + + Inverse probability weights can be used through the ``weights`` argument. See the applied examples for a + demonstration. + + References + ---------- + Cole SR, Edwards JK, Breskin A, Rosin S, Zivich PN, Shook-Sa BE, & Hudgens MG. (2023). Illustration of 2 Fusion + Designs and Estimators. *American Journal of Epidemiology*, 192(3), 467-474. + + Rogan WJ & Gladen B. (1978). Estimating prevalence from the results of a screening test. + *American Journal of Epidemiology*, 107(1), 71-76. + + Ross RK, Cole SR, Edwards JK, Zivich PN, Westreich D, Daniels JL, Price JT & Stringer JSA. (2024). Leveraging + External Validation Data: The Challenges of Transporting Measurement Error Parameters. *Epidemiology*, + 35(2), 196-207. + """ + # Processing inputs + y = np.asarray(y) # Convert to NumPy array + y_star = np.asarray(y_star) # Convert to NumPy array + r = np.asarray(r) # Convert to NumPy array + X = np.asarray(X) # Convert to NumPy array + if weights is None: # Handle weights argument + weights = 1 # ... set all weight as 1 + else: # Otherwise + weights = np.asarray(weights) # ... 
convert to NumPy array + + # Preparing data for estimating equation operations + nXp = X.shape[1] + 1 # Index start for the NumPy matrices + y = np.where(r == 1, -999, y) # Removing NaN (or any other indicators) for Y in main + mu = theta[0] # Parameter of interest + sens = theta[1:nXp] # Parameters for sensitivity model + spec = theta[nXp:] # Parameters for specificity model + + # Nuisance models for sensitivity + ee_sens = ee_regression(theta=sens, y=y_star, X=X, + model='logistic', weights=weights) * (1-r) * y + sens_i = inverse_logit(np.dot(X, sens)) # Predicted sensitivity for each unit + + # Nuisance models for specificity + ee_spec = ee_regression(theta=spec, y=1-y_star, X=X, + model='logistic', weights=weights) * (1-r) * (1-y) + spec_i = inverse_logit(np.dot(X, spec)) # Predicted specificity for each unit + + # Estimating equation for the individual-level version of the Rogan-Gladen correction + rg_equation = (y_star + spec_i - 1) / (sens_i + spec_i - 1) + ee_corr_mean = r * (rg_equation - mu) * weights + + # Returning the stacked estimating equations + return np.vstack([ee_corr_mean, ee_sens, ee_spec]) diff --git a/tests/test_ee_measurement.py b/tests/test_ee_measurement.py index dd03f5c..a2de2b8 100644 --- a/tests/test_ee_measurement.py +++ b/tests/test_ee_measurement.py @@ -8,8 +8,8 @@ import pandas as pd from delicatessen import MEstimator -from delicatessen.estimating_equations import ee_regression, ee_rogan_gladen -from delicatessen.utilities import inverse_logit +from delicatessen.estimating_equations import ee_regression, ee_rogan_gladen, ee_rogan_gladen_extended +from delicatessen.utilities import inverse_logit, logit class TestEstimatingEquationsMeasurement: @@ -40,6 +40,34 @@ def cole_roses_data(self): d['C'] = 1 return d + @pytest.fixture + def data_covs(self): + # Compact data set for X=1 + dc = pd.DataFrame() + dc['Y_star'] = [0, 1] + [0, 1, 0, 1] + dc['Y'] = [np.nan, np.nan] + [0, 0, 1, 1] + dc['S'] = [1, 1] + [0, 0, 0, 0] + dc['n'] = [400, 400] + [75, 25, 5, 95] + d1 = pd.DataFrame(np.repeat(dc.values, dc['n'], axis=0), columns=dc.columns) + d1 = d1[['Y', 'Y_star', 'S']].copy() + d1['C'] = 1 + d1['X'] = 1 + d1['weights'] = 1 + + # Compact data set for X=1 + dc = pd.DataFrame() + dc['Y_star'] = [0, 1] + [0, 1, 0, 1] + dc['Y'] = [np.nan, np.nan] + [0, 0, 1, 1] + dc['S'] = [1, 1] + [0, 0, 0, 0] + dc['n'] = [100, 100] + [85, 15, 20, 80] + d0 = pd.DataFrame(np.repeat(dc.values, dc['n'], axis=0), columns=dc.columns) + d0 = d0[['Y', 'Y_star', 'S']].copy() + d0['C'] = 1 + d0['X'] = 0 + d0['weights'] = np.where(d0['Y'].isna(), 2, 1) + + return pd.concat([d1, d0]) + def test_rogan_gladen(self, cole2023_data): # Replicate Cole et al. 2023 Rogan-Gladen example as a test @@ -112,3 +140,119 @@ def psi(theta): # Checking variance estimate npt.assert_allclose(estr.variance[0:4, 0:4], reference_covar, atol=1e-6) + + def test_extended_rogan_gladen(self, cole2023_data): + # Replicate Cole et al. 
2023 Rogan-Gladen as a comparison for the extended version + + def psi(theta): + return ee_rogan_gladen(theta, + y=cole2023_data['Y'], + y_star=cole2023_data['Y_star'], + r=cole2023_data['S']) + + estr0 = MEstimator(psi, init=[0.5, 0.5, .75, .75]) + estr0.estimate(solver='lm') + + def psi(theta): + return ee_rogan_gladen_extended(theta=theta, y=cole2023_data['Y'], + y_star=cole2023_data['Y_star'], + X=cole2023_data[['C', ]], + r=cole2023_data['S']) + + estr1 = MEstimator(psi, init=[0.5, logit(0.75), logit(0.75), ]) + estr1.estimate(solver='lm') + + # Checking mean estimate + npt.assert_allclose(estr0.theta[0], estr1.theta[0], atol=1e-6) + npt.assert_allclose(estr0.theta[2], inverse_logit(estr1.theta[1]), atol=1e-6) + npt.assert_allclose(estr0.theta[3], inverse_logit(estr1.theta[2]), atol=1e-6) + + # Checking variance estimate + npt.assert_allclose(estr0.variance[0, 0], estr1.variance[0, 0], atol=1e-6) + + def test_rogan_gladen_extended_weights(self, cole_roses_data): + # Replicate Cole et al. 202x Rejoinder example as a test + dr = cole_roses_data + y_no_nan = np.asarray(dr['Y'].fillna(-9)) + ystar_no_nan = np.asarray(dr['Y_star'].fillna(-9)) + w_no_nan = np.asarray(dr[['C', 'W']].fillna(-9)) + s1 = np.where(dr['S'] == 1, 1, 0) + s2 = np.where(dr['S'] == 2, 1, 0) + s3 = np.where(dr['S'] == 3, 1, 0) + + def psi(theta): + param = theta[:3] + beta = theta[3:] + + # Inverse odds weights model + ee_sm = ee_regression(beta, X=w_no_nan, y=s2, + model='logistic') + ee_sm = ee_sm * (1 - s3) + pi_s = inverse_logit(np.dot(w_no_nan, beta)) + iosw = s1 * pi_s / (1-pi_s) + s3 + + # Rogan-Gladen + ee_rg = ee_rogan_gladen_extended(param, + y=y_no_nan, + y_star=ystar_no_nan, + r=s1, X=dr[['C', ]], + weights=iosw) + ee_rg = ee_rg * (1 - s2) + return np.vstack([ee_rg, ee_sm]) + + estr = MEstimator(psi, init=[0.5, .75, .75, 0., 0.]) + estr.estimate(solver='lm') + + reference_theta = [0.0967144999, logit(0.9), logit(0.8)] + + # Checking mean estimate + npt.assert_allclose(estr.theta[0:3], reference_theta, + atol=1e-6) + + def test_rogan_gladen_extended_covs(self, data_covs): + d = data_covs + + def psi(theta): + return ee_rogan_gladen_extended(theta=theta, y=d['Y'], y_star=d['Y_star'], + X=d[['C', 'X']], r=d['S']) + + estr = MEstimator(psi, init=[0.5, 1., 0., 1., 0., ]) + estr.estimate(solver='lm') + + # Checking sensitivity + npt.assert_allclose(.8, inverse_logit(estr.theta[1]), atol=1e-7) + npt.assert_allclose(.95, inverse_logit(estr.theta[1] + estr.theta[2]), atol=1e-7) + + # Checking specificity + npt.assert_allclose(.85, inverse_logit(estr.theta[3]), atol=1e-7) + npt.assert_allclose(.75, inverse_logit(estr.theta[3] + estr.theta[4]), atol=1e-7) + + # Checking corrected mean + pr_x1 = 0.8 + corrected_mu = ((1-pr_x1) * ((0.5 + 0.85 - 1) / (0.85 + 0.8 - 1)) + + pr_x1 * ((0.5 + 0.75 - 1) / (0.75 + 0.95 - 1))) + npt.assert_allclose(corrected_mu, estr.theta[0], atol=1e-7) + + def test_rogan_gladen_extended_covs_weights(self, data_covs): + d = data_covs + + def psi(theta): + return ee_rogan_gladen_extended(theta=theta, y=d['Y'], y_star=d['Y_star'], + X=d[['C', 'X']], r=d['S'], weights=d['weights']) + + estr = MEstimator(psi, init=[0.5, 1., 0., 1., 0., ]) + estr.estimate(solver='lm') + + # Checking sensitivity + npt.assert_allclose(.8, inverse_logit(estr.theta[1]), atol=1e-7) + npt.assert_allclose(.95, inverse_logit(estr.theta[1] + estr.theta[2]), atol=1e-7) + + # Checking specificity + npt.assert_allclose(.85, inverse_logit(estr.theta[3]), atol=1e-7) + npt.assert_allclose(.75, inverse_logit(estr.theta[3] + 
estr.theta[4]), atol=1e-7) + + # Checking corrected mean + pr_x1 = 800 / 1200 + corrected_mu = ((1-pr_x1) * ((0.5 + 0.85 - 1) / (0.85 + 0.8 - 1)) + + pr_x1 * ((0.5 + 0.75 - 1) / (0.75 + 0.95 - 1))) + npt.assert_allclose(corrected_mu, estr.theta[0], atol=1e-7) From e9dc9163eee0200f9c4d4c52fcc0cd005588885e Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:11:31 -0400 Subject: [PATCH 15/18] Extended RG to init --- delicatessen/estimating_equations/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delicatessen/estimating_equations/__init__.py b/delicatessen/estimating_equations/__init__.py index 2339eef..7f25bd6 100644 --- a/delicatessen/estimating_equations/__init__.py +++ b/delicatessen/estimating_equations/__init__.py @@ -7,7 +7,7 @@ from .dose_response import (ee_4p_logistic, ee_3p_logistic, ee_2p_logistic, ee_effective_dose_delta) -from .measurement import (ee_rogan_gladen, +from .measurement import (ee_rogan_gladen, ee_rogan_gladen_extended ) from .regression import (ee_regression, ee_glm, ee_mlogit, From 0bf0ff39084b4bb137e228b579e7fda7b2209c54 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:23:27 -0400 Subject: [PATCH 16/18] Updating docs for new additions --- delicatessen/sandwich.py | 4 ++-- docs/Reference/Estimating Equations.rst | 1 + docs/Reference/M-Estimator.rst | 11 +++++++++++ ...equations.measurement.ee_rogan_gladen_extended.rst | 6 ++++++ .../delicatessen.sandwich.compute_sandwich.rst | 6 ++++++ 5 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 docs/Reference/generated/delicatessen.estimating_equations.measurement.ee_rogan_gladen_extended.rst create mode 100644 docs/Reference/generated/delicatessen.sandwich.compute_sandwich.rst diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py index 5b76377..e337345 100644 --- a/delicatessen/sandwich.py +++ b/delicatessen/sandwich.py @@ -14,7 +14,7 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, allow_pinv=True): - """Compute the empirical sandwich variance estimator given a set of estimating equations and parameter estimates. + r"""Compute the empirical sandwich variance estimator given a set of estimating equations and parameter estimates. Note that this functionality does not solve for the parameter estimates (unlike ``MEstimator``). Instead, it only computes the sandwich for the provided value. @@ -24,7 +24,7 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a V_n(O_i; \theta) = B_n(O_i; \theta)^{-1} F_n(O_i; \theta) \left[ B_n(O_i; \theta)^{-1} \right]^{T} - where :math:`B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial \psi(O_i; \theta)}{\partial \theta}`, + where :math:`B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial}{\partial \theta} \psi(O_i; \theta)`, :math:`F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T`, and :math:`\psi(O_i; \theta)` is the estimating function. 
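As a concrete sketch of the bread/meat assembly described above, the following mirrors the usage exercised in the new tests/test_sandwich.py: the empirical sandwich variance for a sample mean is built from its pieces, with the same toy data and the same scaling-by-n convention as those tests.

    import numpy as np
    from delicatessen.sandwich import compute_bread, compute_meat, build_sandwich

    y = np.array([5, 1, 2, 4, 2, 4, 5, 7, 11, 1, 6, 3, 4, 6])

    def psi(theta):
        # Estimating function for the mean: psi_i = y_i - theta
        return y - theta

    theta_hat = [np.mean(y), ]
    bread = compute_bread(psi, theta=theta_hat, deriv_method='approx') / y.shape[0]  # B_n / n
    meat = compute_meat(psi, theta=theta_hat) / y.shape[0]                           # F_n / n
    sandwich = build_sandwich(bread=bread, meat=meat)                                # B^{-1} F [B^{-1}]^T

Because bread and meat are each divided by n, build_sandwich returns the asymptotic variance (np.var(y, ddof=0) for the mean); dividing once more by n gives the variance of the estimated mean.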
diff --git a/docs/Reference/Estimating Equations.rst b/docs/Reference/Estimating Equations.rst index c80c5dd..41b1d2e 100644 --- a/docs/Reference/Estimating Equations.rst +++ b/docs/Reference/Estimating Equations.rst @@ -47,6 +47,7 @@ Measurement :toctree: generated/ ee_rogan_gladen + ee_rogan_gladen_extended Survival diff --git a/docs/Reference/M-Estimator.rst b/docs/Reference/M-Estimator.rst index 14a4f89..cbb85c2 100644 --- a/docs/Reference/M-Estimator.rst +++ b/docs/Reference/M-Estimator.rst @@ -15,3 +15,14 @@ M-Estimator MEstimator + +Sandwich Variance Estimator +--------------------------- + +.. currentmodule:: delicatessen.sandwich + +.. autosummary:: + :toctree: generated/ + + compute_sandwich + diff --git a/docs/Reference/generated/delicatessen.estimating_equations.measurement.ee_rogan_gladen_extended.rst b/docs/Reference/generated/delicatessen.estimating_equations.measurement.ee_rogan_gladen_extended.rst new file mode 100644 index 0000000..755de8c --- /dev/null +++ b/docs/Reference/generated/delicatessen.estimating_equations.measurement.ee_rogan_gladen_extended.rst @@ -0,0 +1,6 @@ +delicatessen.estimating\_equations.measurement.ee\_rogan\_gladen\_extended +========================================================================== + +.. currentmodule:: delicatessen.estimating_equations.measurement + +.. autofunction:: ee_rogan_gladen_extended \ No newline at end of file diff --git a/docs/Reference/generated/delicatessen.sandwich.compute_sandwich.rst b/docs/Reference/generated/delicatessen.sandwich.compute_sandwich.rst new file mode 100644 index 0000000..427cb74 --- /dev/null +++ b/docs/Reference/generated/delicatessen.sandwich.compute_sandwich.rst @@ -0,0 +1,6 @@ +delicatessen.sandwich.compute\_sandwich +======================================= + +.. currentmodule:: delicatessen.sandwich + +.. autofunction:: compute_sandwich \ No newline at end of file From 649adfd9af180d0c8c592df363a164ffb81f640f Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Tue, 23 Apr 2024 08:27:29 -0400 Subject: [PATCH 17/18] Adding Bonate 2011 to examples --- docs/Examples/Bonate-Pharmaco.ipynb | 426 ++++++++++++++++++++++++++++ docs/Examples/index.rst | 1 + 2 files changed, 427 insertions(+) create mode 100644 docs/Examples/Bonate-Pharmaco.ipynb diff --git a/docs/Examples/Bonate-Pharmaco.ipynb b/docs/Examples/Bonate-Pharmaco.ipynb new file mode 100644 index 0000000..5bcc6d4 --- /dev/null +++ b/docs/Examples/Bonate-Pharmaco.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f25ba4a", + "metadata": {}, + "source": [ + "# Bonate (2011): Pharmacokinetic-Pharmacodynamic Modeling and Simulations\n", + "\n", + "Here, we replicate some of the examples described in Bonate (2011). I recommend following along with the 2nd edition of the book. The purpose of this notebook is to illustrate the versatility of `delicatessen` by illustrating its application for pharmacokinetic modeling. This can easily be done by using the built-in estimating equations, as will be shown.\n", + "\n", + "Bonate PL. (2011). Pharmacokinetic-Pharmacodynamic Modeling and Simulations. 2nd Edition. 
Springer, New York, NY.\n", + "\n", + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4d95e437", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NumPy version: 1.25.2\n", + "SciPy version: 1.11.2\n", + "Pandas version: 1.4.1\n", + "Delicatessen version: 2.2\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import scipy as sp\n", + "import pandas as pd\n", + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf\n", + "\n", + "import delicatessen\n", + "from delicatessen import MEstimator\n", + "from delicatessen.estimating_equations import ee_glm\n", + "\n", + "np.random.seed(80950841)\n", + "\n", + "print(\"NumPy version: \", np.__version__)\n", + "print(\"SciPy version: \", sp.__version__)\n", + "print(\"Pandas version: \", pd.__version__)\n", + "print(\"Delicatessen version:\", delicatessen.__version__)" + ] + }, + { + "cell_type": "markdown", + "id": "5606b501", + "metadata": {}, + "source": [ + "## Chapter 11: Generalized Linear Models and Its Extensions\n", + "\n", + "\n", + "### Adverse Events Case Study\n", + "\n", + "The first example from Chapter 11 is the *Case Study: Assessing the Relationship Between Drug Concentrations and Adverse Events Using Logistic Regression*. Data comes from Table 2 of the book. In the book, a variety of different models for different adverse events are considered. Here, we only consider nausea (and vomiting) by AUC. For the one observations with a missing AUC value, they are dropped from the data set (same as the book). For ease of examining the coefficients, we will also divide the AUC value by 1000. This means the coefficients for AUC are rescaled from those reported in the book.\n", + "\n", + "First, we load the data set and transform the coefficients and add an intercept column to the data set" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1be76e83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 42 entries, 0 to 42\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 42 non-null int64 \n", + " 1 c_max 42 non-null float64\n", + " 2 auc 42 non-null float64\n", + " 3 age 42 non-null int64 \n", + " 4 sex 42 non-null int64 \n", + " 5 ps 42 non-null int64 \n", + " 6 myalgia 42 non-null int64 \n", + " 7 phlebitis 42 non-null int64 \n", + " 8 asthenia 42 non-null int64 \n", + " 9 diarrhea 42 non-null int64 \n", + " 10 nausea 42 non-null int64 \n", + " 11 intercept 42 non-null int64 \n", + "dtypes: float64(2), int64(10)\n", + "memory usage: 4.3 KB\n" + ] + } + ], + "source": [ + "d = pd.read_csv(\"data/bonate.csv\").dropna()\n", + "d['intercept'] = 1 # Adding intercept to data\n", + "d['auc'] /= 1000 # Rescaling AUC\n", + "d['c_max'] /= 1000 # Rescaling C_max\n", + "d.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "143778ea", + "metadata": {}, + "outputs": [], + "source": [ + "table = pd.DataFrame(columns=[\"Model\", \"Intercept\", \"AUC\", \"Sex\", \"Age\", \"PS\"])" + ] + }, + { + "cell_type": "markdown", + "id": "02260fdc", + "metadata": {}, + "source": [ + "To begin, we will fit a null (intercept-only) logistic regression model. This is easily done by using the built-in `ee_glm` estimating equation. For the logistic model, we specify a binomial distribution with the logit link. 
Below is code to setup the estimating equation and then estimate the parameters using `MEstimator`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "207e9a4e", + "metadata": {}, + "outputs": [], + "source": [ + "def psi(theta):\n", + " # Estimating equation for null model\n", + " return ee_glm(theta=theta,\n", + " y=d['nausea'],\n", + " X=d[['intercept', ]],\n", + " distribution='binomial',\n", + " link='logit')\n", + "\n", + "\n", + "# Estimate the parameters of the logit model\n", + "estr_null = MEstimator(psi, init=[0., ])\n", + "estr_null.estimate()\n", + "\n", + "# Adding results to the output table\n", + "table.loc[len(table)] = [\"Null\", estr_null.theta[0], ] + [np.nan, ]*4" + ] + }, + { + "cell_type": "markdown", + "id": "5e0cb694", + "metadata": {}, + "source": [ + "Next we fit a logistic regression model that includes linear terms for all the independent variables in the data set. This is easily done by modifying the previous design matrix (i.e., `X`). Below is code to fit the full model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8d17c1bd", + "metadata": {}, + "outputs": [], + "source": [ + "def psi(theta):\n", + " # Estimating equation for full model\n", + " return ee_glm(theta=theta,\n", + " y=d['nausea'],\n", + " X=d[['intercept', 'auc', 'sex', 'age', 'ps']],\n", + " distribution='binomial',\n", + " link='logit')\n", + "\n", + "\n", + "# Estimate the parameters of the logit model\n", + "estr_full = MEstimator(psi, init=[0., ]*5)\n", + "estr_full.estimate()\n", + "\n", + "# Adding results to the output table\n", + "table.loc[len(table)] = [\"Full\", ] + list(estr_full.theta)" + ] + }, + { + "cell_type": "markdown", + "id": "8f579eda", + "metadata": {}, + "source": [ + "In the book, Bonate performs some variable selection. In general, we would not recommend use of backwards-selection procedures (like those done in the book). Such procedures complicate inference (P-values and confidence intervals after these procedures are no longer valid). For comparison purposes, we estimate the reduced model reported in the book. Again, this is easily done by modifying the `X` argument for `ee_glm`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6558123b", + "metadata": {}, + "outputs": [], + "source": [ + "def psi(theta):\n", + " # Estimating equation for reduced model\n", + " return ee_glm(theta=theta,\n", + " y=d['nausea'],\n", + " X=d[['intercept', 'auc', 'sex']],\n", + " distribution='binomial',\n", + " link='logit')\n", + "\n", + "\n", + "# Estimate the parameters of the logit model\n", + "estr_redu = MEstimator(psi, init=[0., ]*3)\n", + "estr_redu.estimate()\n", + "\n", + "# Adding results to the output table\n", + "table.loc[len(table)] = [\"Reduced\", ] + list(estr_redu.theta) + [np.nan, ]*2" + ] + }, + { + "cell_type": "markdown", + "id": "c4c8ff25", + "metadata": {}, + "source": [ + "Finally, two alternative models are considered: a probit regression model and a complimentary log-log model. Again, these models are easily implemented using `ee_glm`. 
For the probit model, we set the link equal to `probit`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "24a66642", + "metadata": {}, + "outputs": [], + "source": [ + "def psi(theta):\n", + " # Estimating equation for reduced probit model\n", + " return ee_glm(theta=theta,\n", + " y=d['nausea'],\n", + " X=d[['intercept', 'auc', 'sex']],\n", + " distribution='binomial',\n", + " link='probit')\n", + "\n", + "\n", + "# Estimate the parameters of the probit model\n", + "estr_prob = MEstimator(psi, init=[0., ]*3)\n", + "estr_prob.estimate()\n", + "\n", + "# Adding results to the output table\n", + "table.loc[len(table)] = [\"Probit\", ] + list(estr_prob.theta) + [np.nan, ]*2" + ] + }, + { + "cell_type": "markdown", + "id": "643eb533", + "metadata": {}, + "source": [ + "Similarly, the complimentary log-log model only requires setting the link to `cloglog`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6858e649", + "metadata": {}, + "outputs": [], + "source": [ + "def psi(theta):\n", + " # Estimating equation for reduced C-log-log model\n", + " return ee_glm(theta=theta,\n", + " y=d['nausea'],\n", + " X=d[['intercept', 'auc', 'sex']],\n", + " distribution='binomial',\n", + " link='cloglog')\n", + "\n", + "\n", + "# Estimate the parameters of the cloglog model\n", + "estr_clog = MEstimator(psi, init=[0., ]*3)\n", + "estr_clog.estimate()\n", + "\n", + "# Adding results to the output table\n", + "table.loc[len(table)] = [\"Probit\", ] + list(estr_clog.theta) + [np.nan, ]*2" + ] + }, + { + "cell_type": "markdown", + "id": "c28fe617", + "metadata": {}, + "source": [ + "Now we can view the results across the different models" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d90c3a58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
InterceptAUCSexAgePS
Model
Null-0.587787NaNNaNNaNNaN
Full-5.6028990.2897851.7302990.0495380.220545
Reduced-2.6635220.3035021.772238NaNNaN
Probit-1.6297290.1840301.080253NaNNaN
Probit-2.2337200.1931081.212290NaNNaN
\n", + "
" + ], + "text/plain": [ + " Intercept AUC Sex Age PS\n", + "Model \n", + "Null -0.587787 NaN NaN NaN NaN\n", + "Full -5.602899 0.289785 1.730299 0.049538 0.220545\n", + "Reduced -2.663522 0.303502 1.772238 NaN NaN\n", + "Probit -1.629729 0.184030 1.080253 NaN NaN\n", + "Probit -2.233720 0.193108 1.212290 NaN NaN" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.set_index(\"Model\")" + ] + }, + { + "cell_type": "markdown", + "id": "4791334a", + "metadata": {}, + "source": [ + "These point estimates match those reported in Tables 3 and 4 in the book (note the null model differs slightly, since we dropped the one observation with the missing AUC value to fit this model, but the book does not). These results highlight how `delicatessen` allows one to easily fit a variety of different models. \n", + "\n", + "### END\n", + "\n", + "This is the end of the current replication." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/Examples/index.rst b/docs/Examples/index.rst index 2b5d788..dbb9a38 100644 --- a/docs/Examples/index.rst +++ b/docs/Examples/index.rst @@ -13,6 +13,7 @@ user-built estimating equations. LifeScienceExamples Cole-AJE-2023 Boos-Stefanski-Ch7 + Bonate-Pharmaco Hernan-Robins-2023 Morris-Trials-2022 Generalized-Additive-Model From cf805ade925ecc9ad7a030ec75201d65a88d1158 Mon Sep 17 00:00:00 2001 From: Paul Zivich <32672909+pzivich@users.noreply.github.com> Date: Tue, 23 Apr 2024 10:54:08 -0400 Subject: [PATCH 18/18] Updating all docs --- delicatessen/data.py | 28 +- delicatessen/estimating_equations/basic.py | 41 +-- delicatessen/estimating_equations/causal.py | 319 ++++++++---------- .../estimating_equations/dose_response.py | 48 +-- .../estimating_equations/measurement.py | 26 +- .../estimating_equations/regression.py | 307 ++++++++--------- delicatessen/estimating_equations/survival.py | 63 ++-- delicatessen/mestimation.py | 127 +++---- delicatessen/sandwich.py | 68 ++-- delicatessen/utilities.py | 74 ++-- docs/Reference/Utilities.rst | 2 +- ...ssen.derivative.approx_differentiation.rst | 6 + 12 files changed, 539 insertions(+), 570 deletions(-) create mode 100644 docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst diff --git a/delicatessen/data.py b/delicatessen/data.py index 085f5fe..854ed25 100644 --- a/delicatessen/data.py +++ b/delicatessen/data.py @@ -14,7 +14,13 @@ def load_shaq_free_throws(): Returns ------- - ndarray + array : + Returns a 24-by-2 NumPy array. + + References + ---------- + Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference + (pp. 297-337). Springer, New York, NY. """ d = np.array([[ 1, 4, 5], [ 2, 5, 11], @@ -43,7 +49,7 @@ def load_shaq_free_throws(): def load_inderjit(): - """Load example data from Inderjit et al. (2002) on the dose-response of herbicide on perennial ryegrass growth + """Load example data from Inderjit et al. (2002) on the dose-response of herbicide on perennial ryegrass growth. 
Notes ----- @@ -53,7 +59,13 @@ def load_inderjit(): Returns ------- - ndarray + array : + Returns a 24-by-2 NumPy array. + + References + ---------- + Inderjit, Streibig JC, & Olofsdotter M. (2002). Joint action of phenolic acid mixtures and its significance in + allelopathy research. *Physiologia Plantarum*, 114(3), 422-428. """ d = np.array([[7.5800000, 0.00], [8.0000000, 0.00], @@ -83,14 +95,22 @@ def load_inderjit(): def load_robust_regress(outlier=True): - """Load illustrative example for robust linear regression. + """Load illustrative example of robust linear regression published in Zivich et al. (2022). Parameters ---------- + outlier : bool, optional + Whether to induce the outlier (``True``) or not (``False``). Returns ------- + array : + Returns a 15-by-2 NumPy array. + References + ---------- + Zivich PN, Klose M, Cole SR, Edwards JK, & Shook-Sa BE. (2022). Delicatessen: M-estimation in Python. + *arXiv:2203.11300*. """ height = [168.519, 166.944, 164.327, 164.058, 166.212, 167.358, 165.244, 169.352, 159.386, 166.953, 163.876, diff --git a/delicatessen/estimating_equations/basic.py b/delicatessen/estimating_equations/basic.py index 7047c1e..a7e872f 100644 --- a/delicatessen/estimating_equations/basic.py +++ b/delicatessen/estimating_equations/basic.py @@ -20,11 +20,6 @@ def ee_mean(theta, y): \sum_{i=1}^n (Y_i - \theta) = 0 - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector @@ -36,7 +31,7 @@ def ee_mean(theta, y): Returns ------- array : - Returns a 1-by-n NumPy array evaluated for the input ``theta`` and ``y`` + Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` and ``y`` Examples -------- @@ -89,11 +84,6 @@ def ee_mean_robust(theta, y, k, loss='huber', lower=None, upper=None): Tukey's biweight, Andrew's Sine, and Hampel. See ``robust_loss_function`` for further details on the loss functions for the robust mean. - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector @@ -105,18 +95,18 @@ def ee_mean_robust(theta, y, k, loss='huber', lower=None, upper=None): Tuning or hyperparameter for the chosen loss function. Notice that the choice of hyperparameter depends on the loss function. loss : str, optional - Robust loss function to use. Default is 'huber'. Options include 'andrew', 'hampel', 'huber', 'tukey'. + Robust loss function to use. Default is ``'huber'``. Options include ``'andrew'``, ``'hampel'``, ``'tukey'``. lower : int, float, None, optional - Lower parameter for the 'hampel' loss function. This parameter does not impact the other loss functions. + Lower parameter for the Hampel loss function. This parameter does not impact the other loss functions. Default is ``None``. upper : int, float, None, optional - Upper parameter for the 'hampel' loss function. This parameter does not impact the other loss functions. + Upper parameter for the Hampel loss function. This parameter does not impact the other loss functions. Default is ``None``. Returns ------- array : - Returns a 1-by-n NumPy array evaluated for the input theta and y + Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` and ``y``. 
Examples -------- @@ -186,12 +176,6 @@ def ee_mean_variance(theta, y): Unlike ``ee_mean``, ``theta`` consists of 2 parameters. The output covariance matrix will also provide estimates for each of the ``theta`` values. - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - - Parameters ---------- theta : ndarray, list, vector @@ -204,7 +188,7 @@ def ee_mean_variance(theta, y): Returns ------- array : - Returns a 2-by-n NumPy array evaluated for the input theta and y + Returns a 2-by-`n` NumPy array evaluated for the input ``theta`` and ``y``. Examples -------- @@ -273,12 +257,12 @@ def ee_percentile(theta, y, q): 1-dimensional vector of n observed values. No missing data should be included (missing data may cause unexpected behavior when attempting to calculate the mean). q : float - Percentile to calculate. Must be (0, 1) + Percentile to calculate. Must be :math:`(0, 1)` Returns ------- array : - Returns a 1-by-n NumPy array evaluated for the input theta and y + Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` and ``y``. Examples -------- @@ -309,7 +293,7 @@ def ee_percentile(theta, y, q): >>> estr.theta Then displays the estimated percentile / median. In this example, there is a difference between the closed form - solution (-0.07978) and M-Estimation (-0.06022). + solution (``-0.07978``) and M-Estimation (``-0.06022``). References ---------- @@ -352,11 +336,6 @@ def ee_positive_mean_deviation(theta, y): sandwich) cannot be used to estimate the variance. This estimating equation is offered for completeness, but is not generally recommended for applications. - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector @@ -369,7 +348,7 @@ def ee_positive_mean_deviation(theta, y): Returns ------- array : - Returns a 2-by-n NumPy array evaluated for the input theta and y + Returns a 2-by-`n` NumPy array evaluated for the input ``theta`` and ``y``. Examples -------- diff --git a/delicatessen/estimating_equations/causal.py b/delicatessen/estimating_equations/causal.py index 7e2fa63..ceced5e 100644 --- a/delicatessen/estimating_equations/causal.py +++ b/delicatessen/estimating_equations/causal.py @@ -13,37 +13,36 @@ def ee_gformula(theta, y, X, X1, X0=None, force_continuous=False): - r"""Estimating equations for the g-computation. The parameter of interest can either be the mean under a single - policy or plan of action, or the mean difference between two policies. This is accomplished by providing the - estimating equation the observed data (``X``, ``y``), and the same data under the actions (``X1`` and optionally - ``X0``). + r"""Estimating equations for the g-formula (or g-computation). The parameter of interest can either be the mean + under a single policy or plan of action, or the mean difference between two policies. This is accomplished by + providing the estimating equation the observed data (``X``, ``y``), and the same data under the actions (``X1`` + and optionally ``X0``). - The outcome regression estimating equation is + The stack of estimating equations are .. 
math:: - \sum_{i=1}^n \left\{ Y_i - g(X_i^T \beta) \right\} X_i = 0 + \sum_{i=1}^n + \begin{bmatrix} + \left\{ g({X_i^*}^T \beta) - \theta_1 \right\} \\ + \left\{ Y_i - g(X_i^T \beta) \right\} X_i + \end{bmatrix} + = 0 - where :math:`g` indicates a transformation function. For linear regression, :math:`g` is the identity function. - Logistic regression uses the inverse-logit function. By default, `ee_gformula` detects whether `y` is all binary + where the first is the mean under the specified plan, with the plan setting the values of action :math:`A` (e.g., + exposure, treatment, vaccination, etc.), and the second equation is the outcome regression model. + Here, :math:`g` indicates a transformation function. For linear regression, :math:`g` is the identity function. + Logistic regression uses the inverse-logit function. By default, ``ee_gformula`` detects whether `y` is all binary (zero or one), and applies logistic regression if that is evaluated to be true. - There are two variations on the parameter of interest. The first could be the mean under a plan, where the plan sets - the values of action :math:`A` (e.g., exposure, treatment, vaccination, etc.). The estimating equation for this - causal mean is - - .. math:: - - \sum_{i=1}^n \left\{ g({X_i^*}^T \beta) - \theta_1 \right\} = 0 - Note ---- - This variation includes :math:`1+b` parameters, where the first parameter is the causal mean, and the remainder are + This variation includes 1+`b` parameters, where the first parameter is the causal mean, and the remainder are the parameters for the regression model. - The alternative parameter of interest could be the mean difference between two plans. A common example of this would - be the average causal effect, where the plans are all-action-one versus all-action-zero. Therefore, the estimating + Alternatively, a causal mean difference is estimated when ``X0`` is specified. A common example of this would be + the average causal effect, where the plans are all-action-one versus all-action-zero. Therefore, the estimating equations consist of the following three equations .. math:: @@ -52,46 +51,38 @@ def ee_gformula(theta, y, X, X1, X0=None, force_continuous=False): \begin{bmatrix} (\theta_1 - \theta_2) - \theta_0 \\ g({X_i^1}^T \beta) - \theta_1 \\ - g({X_i^0}^T \beta) - \theta_2 + g({X_i^0}^T \beta) - \theta_2 \\ + \left\{ Y_i - g(X_i^T \beta) \right\} X_i \end{bmatrix} = 0 Note ---- - This variation includes :math:`3+b` parameters, where the first parameter is the causal mean difference, the second + This variation includes 3+`b` parameters, where the first parameter is the causal mean difference, the second is the causal mean under plan 1, the third is the causal mean under plan 0, and the remainder are the parameters for the regression model. - - The parameter of interest is designated by the user via whether the optional argument ``X0`` is left as ``None`` - (which estimates the causal mean) or is given an array (which estimates the causal mean difference and the - corresponding causal means). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta consists of 1+b values if ``X0`` is ``None``, and 3+b values if ``X0`` is not ``None``. + Theta consists of 1+`b` values if ``X0`` is ``None``, and 3+`b` values if ``X0`` is not ``None``. y : ndarray, list, vector - 1-dimensional vector of n observed values. 
+ 1-dimensional vector of `n` observed values. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. X1 : ndarray, list, vector - 2-dimensional vector of n observed values for b variables under the action plan. + 2-dimensional vector of `n` observed values for `b` variables under the action plan. X0 : ndarray, list, vector, None, optional - 2-dimensional vector of n observed values for b variables under the separate action plan. This second argument - is optional and should be specified if the causal mean difference between two action plans is of interest. + 2-dimensional vector of `n` observed values for `b` variables under the separate action plan. This second + argument is optional and should be specified if the causal mean difference between two action plans is of + interest. force_continuous : bool, optional Option to force the use of linear regression despite detection of a binary variable. Returns ------- array : - Returns a (1+b)-by-n NumPy array if ``X0=None``, or returns a (3+b)-by-n NumPy array if ``X0!=None`` + Returns a (1+`b`)-by-`n` NumPy array if ``X0=None``, or returns a (3+`b`)-by-`n` NumPy array if ``X0!=None`` Examples -------- @@ -248,18 +239,10 @@ def ee_gformula(theta, y, X, X1, X0=None, force_continuous=False): def ee_ipw(theta, y, A, W, truncate=None, weights=None): - r"""Estimating equation for inverse probability weighting estimator. For estimation of the weights (or propensity - scores), a logistic model is used. The first estimating equations for the logistic regression model are + r"""Estimating equation for inverse probability weighting (IPW) estimator. The average causal effect is estimated by + this implementation of the IPW estimator. For estimation of the propensity scores, a logistic model is used. - .. math:: - - \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0 - - where A is the action and W is the set of confounders. - - For the implementation of the inverse probability weighting estimator, stacked estimating equations are used - for the mean had everyone been set to ``A=1``, the mean had everyone been set to ``A=0``, and the mean difference - between the two causal means. The estimating equations are + The stacked estimating equations are .. math:: @@ -267,52 +250,41 @@ def ee_ipw(theta, y, A, W, truncate=None, weights=None): \begin{bmatrix} (\theta_1 - \theta_2) - \theta_0 \\ \frac{A_i Y_i}{\pi_i} - \theta_1 - \theta_1 \\ - \frac{(1-A_i) Y_i}{1-\pi_i} - \theta_2 + \frac{(1-A_i) Y_i}{1-\pi_i} - \theta_2 \\ + \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i \end{bmatrix} = 0 - where :math:`\pi_i = expit(W_i^T \alpha)`. Due to these 3 extra values, the length of the theta vector is 3+b, - where b is the number of parameters in the regression model. - - Note - ---- - Unlike ``ee_gformula``, ``ee_ipw`` always provides the average causal effect, and causal means for ``A=1`` and - ``A=0``. - - - Here, theta corresponds to a variety of different quantities. The *first* value in theta vector is the causal mean - difference, the *second* is the mean had everyone been set to ``A=1``, the *third* is the mean had everyone been - set to ``A=0``. The remainder of the parameters correspond to the logistic regression model coefficients. - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. 
+ where :math:`A` is the action, math:`W` is the set of confounders, and :math:`\pi_i = expit(W_i^T \alpha)`. The + first estimating equation is for the average causal effect, the second is for the mean under :math:`A:=1`, + the third is for the mean under :math:`A:=0`, and the last is the logistic regression model for the propensity + scores. Here, the length of the theta vector is 3+`b`, where `b` is the number of parameters in the regression + model. Parameters ---------- theta : ndarray, list, vector - Theta consists of 3+b values. + Theta consists of 3+`b` values. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. A : ndarray, list, vector - 1-dimensional vector of n observed values. The A values should all be 0 or 1. + 1-dimensional vector of `n` observed values. The A values should all be 0 or 1. W : ndarray, list, vector - 2-dimensional vector of n observed values for b variables to model the probability of ``A`` with. + 2-dimensional vector of `n` observed values for `b` variables to model the probability of ``A`` with. truncate : None, list, set, ndarray, optional Bounds to truncate the estimated probabilities of ``A`` at. For example, estimated probabilities above 0.99 or below 0.01 can be set to 0.99 or 0.01, respectively. This is done by specifying ``truncate=(0.01, 0.99)``. Note this step is done via ``numpy.clip(.., a_min, a_max)``, so order is important. Default is ``None``, which applies no truncation. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. This + 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. This argument is intended to support the use of missingness weights. The propensity score model is *not* fit using these weights. Returns ------- array : - Returns a (3+b)-by-n NumPy array evaluated for the input ``theta`` + Returns a (3+`b`)-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -334,7 +306,7 @@ def ee_ipw(theta, y, A, W, truncate=None, weights=None): >>> d['Y'] = (1-d['A'])*d['Ya0'] + d['A']*d['Ya1'] >>> d['C'] = 1 - Defining psi, or the stacked estimating equations. Note that 'A' is the action. + Defining psi, or the stacked estimating equations. Note that ``'A'`` is the action. >>> def psi(theta): >>> return ee_ipw(theta, y=d['Y'], A=d['A'], @@ -409,39 +381,42 @@ def ee_ipw(theta, y, A, W, truncate=None, weights=None): def ee_ipw_msm(theta, y, A, W, V, distribution, link, hyperparameter=None, truncate=None, weights=None): - r"""Estimating equation for inverse probability weighting estimator of the parameters of a marginal structural - model. For estimation of the weights (or propensity scores), a logistic model is used. The first estimating - equations for the logistic regression model are + r"""Estimating equation for parameters of a marginal structural model estimated using inverse probability weighting. + For estimation of the propensity scores, a logistic model is used. + + The stacked estimating equations are .. math:: - \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0 + \sum_{i=1}^n + \begin{bmatrix} + \frac{1}{\pi_i} \left\{ Y_i - g^{-1}(X_i^T \beta) \right\} \times \frac{D(\beta)}{v(\beta)} X_i \\ + \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i + \end{bmatrix} + = 0 - where A is the action and W is the set of confounders. 
For the implementation of the inverse probability weighting
-    estimator of the marginal structural model, a weighted generalized linear model is used. See ``ee_glm`` for details
-    on this estimating equation.
+    where :math:`A` is the action, :math:`W` is the set of confounders, and :math:`\pi_i = \text{expit}(W_i^T \alpha)`.
+    Here, :math:`X` is the design matrix for the marginal structural model (it includes :math:`A`, and possibly some
+    covariates from :math:`W`). The first estimating equation is a weighted generalized linear model; see ``ee_glm``
+    for details on this estimating equation. The second estimating equation is the logistic model for the propensity
+    scores.
 
     Here, ``theta`` corresponds to multiple quantities. The *first* set of values correspond to the parameters of the
     marginal structural model, and the *second* set correspond to the logistic regression model coefficients for the
     propensity scores.
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 3+b values.
+        Theta consists of `c`+`b` values.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables to model the probability of ``A`` with.
+        2-dimensional vector of `n` observed values for `b` variables to model the probability of ``A`` with.
     V : ndarray, list, vector
-        2-dimensional vector of n observed values for c variables in the marginal structural model.
+        2-dimensional vector of `n` observed values for `c` variables in the marginal structural model.
     distribution : str
         Distribution for the generalized linear model. See ``ee_glm`` for options.
     link : str
@@ -459,7 +434,7 @@ def ee_ipw_msm(theta, y, A, W, V, distribution, link, hyperparameter=None, trunc
     Returns
     -------
     array :
-        Returns a (3+b)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (`c`+`b`)-by-`n` NumPy array evaluated for the input ``theta``.
 
     Examples
     --------
@@ -558,26 +533,9 @@ def ee_ipw_msm(theta, y, A, W, V, distribution, link, hyperparameter=None, trunc
 
 def ee_aipw(theta, y, A, W, X, X1, X0, truncate=None, force_continuous=False):
     r"""Estimating equation for augmented inverse probability weighting (AIPW) estimator. AIPW consists of two nuisance
-    models (the propensity score model and the outcome model). For estimation of the propensity scores, the estimating
-    equations are
+    models (the propensity score model and the outcome model).
 
-    .. math::
-
-        \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0
-
-    where ``A`` is the treatment and ``W`` is the set of confounders. The estimating equations for the outcome model
-    are
-
-    .. math::
-
-        \sum_{i=1}^n \left\{ Y_i - g(X_i^T \beta) \right\} X_i = 0
-
-    By default, `ee_aipw` detects whether `y` is all binary (zero or one), and applies logistic regression. Notice that
-    ``X`` here should consists of both ``A`` and ``W`` (with possible interaction terms or other differences in
-    functional forms from the propensity score model).
-
-    The AIPW estimating equations include the causal mean difference, mean had everyone been set to ``A=1``, and the
-    mean had everyone been set to ``A=0``
+    The stacked estimating equations are
 
     .. math::
 
@@ -585,46 +543,39 @@ def ee_aipw(theta, y, A, W, X, X1, X0, truncate=None, force_continuous=False):
         \begin{bmatrix}
             (\theta_1 - \theta_2) - \theta_0 \\
             \frac{A_i Y_i}{\pi_i} - \frac{\hat{Y^1}(A_i-\pi_i)}{\pi_i} - \theta_1 \\
-            \frac{(1-A_i) Y_i}{1-\pi_i} + \frac{\hat{Y^0}(A_i-\pi_i}{1-\pi_i} - \theta_2
+            \frac{(1-A_i) Y_i}{1-\pi_i} + \frac{\hat{Y^0}(A_i-\pi_i)}{1-\pi_i} - \theta_2 \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i \\
+            \left\{ Y_i - g(X_i^T \beta) \right\} X_i
         \end{bmatrix}
         = 0
 
-    where :math:`\hat{Y}^a = g({X_i^*}^T \beta)`.
-
-    Note
-    ----
-    Unlike ``ee_gformula``, ``ee_aipw`` always provides the average causal effect, and causal means for ``A=1`` and
-    ``A=0``.
-
+    where :math:`A` is the action, :math:`W` is the set of confounders, :math:`Y` is the outcome,
+    :math:`\pi_i = \text{expit}(W_i^T \alpha)`, and :math:`\hat{Y}^a = g({X_i^*}^T \beta)`. The first estimating
+    equation is for the average causal effect, the second is for the mean under :math:`A:=1`, the third is for the
+    mean under :math:`A:=0`, the fourth is the logistic regression model for the propensity scores, and the last is
+    for the outcome model. Here, the length of the theta vector is 3+`b`+`c`, where `b` is the number of parameters
+    in the propensity score model and `c` is the number of parameters in the outcome model.
 
-    Due to these 3 extra values and two nuisance models, the length of the parameter vector is 3+b+c, where b is the
-    number of columns in ``W``, and c is the number of columns in ``X``. The *first* value in theta vector is the
-    causal mean difference (or average causal effect), the *second* is the mean had everyone been given ``A=1``, the
-    *third* is the mean had everyone been given ``A=0``. The remainder of the parameters correspond to the regression
-    model coefficients, in the order input. The first 'chunk' of coefficients correspond to the propensity score model
-    and the last 'chunk' correspond to the outcome model.
-
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
+    By default, `ee_aipw` detects whether `y` is all binary (zero or one), and applies logistic regression. Notice
+    that ``X`` here should consist of both ``A`` and ``W`` (with possible interaction terms or other differences in
+    functional forms from the propensity score model).
 
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 3+b+c values.
+        Theta consists of 3+`b`+`c` values.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables to model the probability of ``A`` with.
+        2-dimensional vector of `n` observed values for `b` variables to model the probability of ``A`` with.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for c variables to model the outcome ``y``.
+        2-dimensional vector of `n` observed values for `c` variables to model the outcome ``y``.
X1 : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables under the action plan where ``A=1`` for all units.
+        2-dimensional vector of `n` observed values for `c` variables under the action plan where ``A=1`` for all units.
     X0 : ndarray, list, vector, None, optional
-        2-dimensional vector of n observed values for b variables under the action plan where ``A=0`` for all units.
+        2-dimensional vector of `n` observed values for `c` variables under the action plan where ``A=0`` for all units.
     truncate : None, list, set, ndarray, optional
         Bounds to truncate the estimated probabilities of ``A`` at. For example, estimated probabilities above 0.99 or
         below 0.01 can be set to 0.99 or 0.01, respectively. This is done by specifying ``truncate=(0.01, 0.99)``. Note
@@ -636,7 +587,7 @@ def ee_aipw(theta, y, A, W, X, X1, X0, truncate=None, force_continuous=False):
     Returns
     -------
     array :
-        Returns a (3+b+c)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (3+`b`+`c`)-by-`n` NumPy array evaluated for the input ``theta``
 
     Examples
     --------
@@ -787,15 +738,15 @@ def ee_gestimation_snmm(theta, y, A, W, V, X=None, model='linear', weights=None)
 
         E[Y^a - Y^{0} | A=a, V] = \beta_1 a + \beta_2 a V
 
-    This model corresponds to the average causal effect among those with :math:`A=a` within strata of :math:`V`. The
+    This model corresponds to the average causal effect among those with :math:`A=a` by :math:`V`. The
     log-linear SMM is defined as
 
     .. math::
 
         \frac{E[Y^a | A=a, V]}{E[Y^{0} | A=a, V]} = \exp(\beta_1 a + \beta_2 a V)
 
-    This model corresponds to the causal mean ratio among those with :math:`A=a` within strata of :math:`V`. Note that
-    the log-linear SMM is only defined when :math:`Y > 0`. The parameters of either SMM can be identified under the
+    This model corresponds to the causal mean ratio among those with :math:`A=a` by :math:`V`. Note that
+    the log-linear SMM is only defined when :math:`Y > 0`. The parameters of either SMM are identified under the
     assumptions of causal consistency, and exchangeability with positivity.
 
     Two different estimating equations are available for g-estimation. The first set is referred to as the 'inefficient'
@@ -803,57 +754,68 @@ def ee_gestimation_snmm(theta, y, A, W, V, X=None, model='linear', weights=None)
 
     .. math::
 
-        \sum_{i=1}^n \left\{ H(\beta) \times (A - E[A | W]) \right\} \times \mathbb{V}_i = 0
+        \sum_{i=1}^n
+        \begin{bmatrix}
+            \left\{ H(\beta) \times (A - \pi_i) \right\} \times V_i \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i
+        \end{bmatrix}
+        = 0
 
-    where :math:`H(\beta) = Y - \beta A \mathbb{V}` for a linear SMM and
-    :math:`H(\beta) = Y \times \exp(-A \beta \mathbb{V})` for a log-linear SMM, where :math:`\mathbb{V}` is a design
-    matrix. Note that :math:`V \subseteq W`, where :math:`W` is the set of confounding variables. This estimating
-    equation requires :math:`E[A|W]`, which must be estimated. This is done via the following estimating equation for
-    binary actions
+    where :math:`\pi_i = \text{expit}(W_i^T \alpha)`,
+    :math:`H(\beta) = Y - \beta A \mathbb{V}` for a linear SMM, and
+    :math:`H(\beta) = Y \times \exp(-A \beta \mathbb{V})` for a log-linear SMM, where :math:`\mathbb{V}` is a design
+    matrix. Note that :math:`V \subseteq W`, where :math:`W` is the set of confounding variables.
+    The length of the parameter vector is `b`+`c`, where `b` is the number of columns in ``V``, and
+    `c` is the number of columns in ``W``.
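Before turning to the efficient variant below, here is a rough sketch of how the inefficient g-estimation specification just described might be stacked and solved. It reuses the simulated data frame ``d`` from the ``ee_gformula`` sketch earlier; the intercept-only ``V`` matrix and the column choices are hypothetical, and only the ``ee_gestimation_snmm`` signature shown in this patch is relied on:

    from delicatessen.estimating_equations import ee_gestimation_snmm

    def psi(theta):
        # b=1 column in V (intercept-only SMM) plus c=2 columns in W, so theta has 3 values
        return ee_gestimation_snmm(theta, y=d['Y'], A=d['A'],
                                   W=d[['C', 'W']],   # design matrix for E[A | W]
                                   V=d[['C', ]],      # SMM design matrix; A is *not* included here
                                   model='linear')

    estr = MEstimator(stacked_equations=psi, init=[0., 0., 0.])
    estr.estimate()
    print(estr.theta[0])    # SMM parameter: effect of A among those with A=1, per the SMM above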
-    .. math::
-
-        \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0
-
-    These estimating equations are stacked together. Therefore, the length of the parameter vector is b+c, where b is
-    the number of columns in ``V``, and c is the number of columns in ``W``. The *first* b values in theta
-    vector are the SMM parameters. The *second* set are the parameters corresponding to the :math:`E[A|W]` model.
-
-    The second implementation for g-estimation is the 'efficient' g-estimator. For the efficient g-estimator we replace
-    :math:`H(\beta)` with :math:`\{H(\beta) - E[H(\beta) | W]\}` in the prior estimating equation. Here, we also need to
-    specify a model for :math:`E[H(\beta) | W]`. Therefore, an additional estimating equation for
-    :math:`E[H(\beta) | W]` is stacked with the others. Therefore, there are b+c+d parameters for the efficient
-    g-estimator, where d is the number of parameters in the model for :math:`E[H(\beta) | W]`.
+    The second implementation for g-estimation is the 'efficient' g-estimator. For the efficient g-estimator we replace
+    :math:`H(\beta)` with :math:`\{H(\beta) - E[H(\beta) | W]\}` in the prior estimating equation and specify a model
+    for :math:`E[H(\beta) | W]`. The corresponding stacked estimating equations are
+
+    .. math::
+
+        \sum_{i=1}^n
+        \begin{bmatrix}
+            \left\{ (H(\beta) - g^{-1}(W_i^T \gamma)) \times (A - \pi_i) \right\} \times V_i \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i \\
+            \left\{ H(\beta) - g^{-1}(W_i^T \gamma) \right\} W_i
+        \end{bmatrix}
+        = 0
+
+    where :math:`g^{-1}` is the inverse transformation for the specified SMM. Therefore, there are `b`+`c`+`d`
+    parameters for the efficient g-estimator, where `d` is the number of parameters in the model for
+    :math:`E[H(\beta) | W]`.
 
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 1+b values if ``X0`` is ``None``, and 3+b values if ``X0`` is not ``None``.
+        Theta consists of `b`+`c` values if ``X`` is ``None``, and `b`+`c`+`d` values if ``X`` is not ``None``.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values of the outcome.
+        1-dimensional vector of `n` observed values of the outcome.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values of the action. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values of the action. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b columns of a design matrix to model the expected value of ``A``.
+        2-dimensional vector of `n` observed values for `c` columns of a design matrix to model the expected value of
+        ``A``.
     V : ndarray, list, vector
-        2-dimensional vector of n observed values for b columns of a design matrix for the structural mean model. Note
-        that the design matrix here is expected to not include the observed values of ``A``
+        2-dimensional vector of `n` observed values for `b` columns of a design matrix for the structural mean model.
+        Note that the design matrix here is expected to not include the observed values of ``A``
     X : ndarray, list, vector, None, optional
         Default of this argument is ``None``, which implements the estimating equation for the inefficient g-estimator.
-        To use the efficient g-estimator, a 2-dimensional vector of n observed values for b columns of a design matrix
+        To use the efficient g-estimator, a 2-dimensional vector of `n` observed values for `d` columns of a design
+        matrix
         for the :math:`E[H(\beta) | W]` model should be provided here.
     model : str, optional
        Type of structural mean model to fit. Options are currently: ``linear``, ``poisson``. Default is ``linear``.
The Poisson model specification can be used for positive continuous data, or with binary data in order to estimate causal risk ratios. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. This + 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. This argument is intended to support the use of sampling or missingness weights. Returns ------- array : - Returns a (b+c)-by-n (inefficient) or (b+c+d)-by-n (efficient) NumPy array evaluated for the input ``theta`` + Returns a (`b`+`c`)-by-`n` (inefficient) or (`b`+`c`+`d`)-by-`n` (efficient) NumPy array evaluated for the + input ``theta``. Examples -------- @@ -1040,34 +1002,35 @@ def ee_mean_sensitivity_analysis(theta, y, delta, X, q_eval, H_function): probability weighting estimator will result in different (but similar) estimates. - The length of the parameter vector, :math:`\theta`, is 1+b, where b is the number of columns in ``X``. The *first* - value in the theta vector is the corrected mean of :math:`Y`. The remainder of the parameters correspond to the - regression model coefficients. + The length of the parameter vector, :math:`\theta`, is 1+`b`, where `b` is the number of columns in ``X``. + The *first* value in the theta vector is the corrected mean of :math:`Y`. The remainder of the parameters + correspond to the regression model coefficients. Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of 1+b values. Therefore, initial values should consist of one plus the number of + Theta in this case consists of 1+`b` values. Therefore, initial values should consist of one plus the number of columns present in ``X``. This can easily be accomplished generally by ``[0, ] + [0, ] * X.shape[1]``. y : ndarray, list, vector - 1-dimensional vector of n values. Any values of ``y`` that are missing should be indicated by the ``delta`` + 1-dimensional vector of `n` values. Any values of ``y`` that are missing should be indicated by the ``delta`` parameter. delta : ndarray, list, vector - 1-dimensional vector of n observed values indicating whether the observation has a value for ``y`` observed, + 1-dimensional vector of `n` observed values indicating whether the observation has a value for ``y`` observed, where 1 indicates yes and 0 indicated no. This vector should not include any ``nan`` values. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables consider as predictors. At a minimum, a vector of ones - (intercept) should be included. This matrix should not include any ``nan`` values. + 2-dimensional vector of `n` observed values for `b` variables consider as predictors. At a minimum, a vector + of ones (intercept) should be included. This matrix cannot include any ``nan`` values. q_eval : ndarray, list, vector - 1-dimensional vector of n values evaluated using the :math:`q(Y; \alpha)` function. + 1-dimensional vector of `n` values evaluated using the :math:`q(Y; \alpha)` function. H_function : callable - Function use to bound the observations between 0,1. The function must be monotonic increasing and be bounded by - :math:`[0,1]`. For example, the expit (``delicatessen.utilities.inverse_logit``) function meets this criteria. + Function use to bound the observations between :math:`[0,1]`. The function must be monotonic increasing and be + bounded by :math:`[0,1]`. 
For example, the expit (``delicatessen.utilities.inverse_logit``) function meets + this criteria. Returns ------- array : - Returns a (1+b)-by-n NumPy array evaluated for the input ``theta`` + Returns a (1+`b`)-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- diff --git a/delicatessen/estimating_equations/dose_response.py b/delicatessen/estimating_equations/dose_response.py index 405efa5..7dc2839 100644 --- a/delicatessen/estimating_equations/dose_response.py +++ b/delicatessen/estimating_equations/dose_response.py @@ -26,28 +26,24 @@ def ee_4p_logistic(theta, X, y): :math:`\rho = \frac{D_i}{\theta_1}^{\theta_2}`, and :math:`\hat{Y_i} = \theta_0 + \frac{\theta_3 - \theta_0}{1+\rho}`. - Here, theta is a 1-by-4 array, where 4 are the 4 parameters of the 4PL. The first theta corresponds to lower limit + Here, theta is a 1-by-4 array. The first theta corresponds to lower limit (:math:`\theta_0`), the second corresponds to the effective dose (ED50) (:math:`\theta_1`), the third corresponds to the steepness of the curve (:math:`\theta_2`), and the fourth corresponds to the upper limit (:math:`\theta_3`). - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of 4 values. In general, starting values ``>0`` are better choices for the 4PL model + Theta in this case consists of 4 values. In general, starting values :math:`>0` are better choices for the + 4PL model X : ndarray, list, vector - 1-dimensional vector of n dose values. + 1-dimensional vector of `n` dose values. y : ndarray, list, vector - 1-dimensional vector of n response values. + 1-dimensional vector of `n` response values. Returns ------- array : - Returns a 4-by-n NumPy array evaluated for the input ``theta`` + Returns a 4-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -162,26 +158,22 @@ def ee_3p_logistic(theta, X, y, lower): corresponds to the upper limit (:math:`\theta_3`). The lower limit (:math:`\theta_0`, ``lower``) is pre-specified by the user (and is no longer estimated) - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of 3 values. In general, starting values ``>0`` are better choices for the 3PL model + Theta in this case consists of 3 values. In general, starting values :math:`>0` are better choices for the + 3PL model X : ndarray, list, vector - 1-dimensional vector of n dose values. + 1-dimensional vector of `n` dose values. y : ndarray, list, vector - 1-dimensional vector of n response values. + 1-dimensional vector of `n` response values. lower : int, float Set value for the lower limit. Returns ------- array : - Returns a 3-by-n NumPy array evaluated for the input theta, y, X + Returns a 3-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -260,19 +252,15 @@ def ee_2p_logistic(theta, X, y, lower, upper): (:math:`\theta_0`, ``lower``) and upper limit (:math:`\theta_3`, ``upper``) are pre-specified by the user (and are no longer estimated) - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. 
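For the dose-response estimating equations above, a minimal sketch of a 4PL fit is shown below. The simulated dose-response data and the starting values are hypothetical, and convergence in practice depends on the data and on choosing sensible (positive) starting values, as noted in the docstrings:

    import numpy as np
    from delicatessen import MEstimator
    from delicatessen.estimating_equations import ee_4p_logistic

    np.random.seed(236)
    dose = np.repeat([0.5, 1., 2., 4., 8., 16., 32.], 5)
    # 4PL curve with lower limit 2, ED50 of 4, steepness 1.5, upper limit 15, plus noise
    resp = 2 + (15 - 2) / (1 + (dose / 4)**1.5) + np.random.normal(0, 0.25, size=dose.shape)

    def psi(theta):
        return ee_4p_logistic(theta, X=dose, y=resp)

    # starting values: observed minimum, a guess at the ED50, a guess at steepness, observed maximum
    estr = MEstimator(stacked_equations=psi, init=[np.min(resp), 4., 1., np.max(resp)])
    estr.estimate()
    print(estr.theta)    # lower limit, ED50, steepness, upper limit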
- Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of 2 values. In general, starting values >0 are better choices for the 3PL model + Theta in this case consists of 2 values. In general, starting values :math:`>0` are better choices for the + 2PL model X : ndarray, list, vector - 1-dimensional vector of n dose values. + 1-dimensional vector of `n` dose values. y : ndarray, list, vector - 1-dimensional vector of n response values. + 1-dimensional vector of `n` response values. lower : int, float Set value for the lower limit. upper : int, float @@ -281,7 +269,7 @@ def ee_2p_logistic(theta, X, y, lower, upper): Returns ------- array : - Returns a 2-by-n NumPy array evaluated for the input theta, y, X + Returns a 2-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -356,7 +344,7 @@ def ee_effective_dose_delta(theta, y, delta, steepness, ed50, lower, upper): theta : int, float Theta value corresponding to the ED(alpha). y : ndarray, list, vector - 1-dimensional vector of n response values, used to construct correct shape for output. + 1-dimensional vector of `n` response values, used to construct correct shape for output. delta : float The effective dose level of interest, ED(alpha). steepness : float @@ -373,7 +361,7 @@ def ee_effective_dose_delta(theta, y, delta, steepness, ed50, lower, upper): Returns ------- array : - Returns a 1-by-n NumPy array evaluated for the input theta + Returns a 1-by-`n` NumPy array evaluated for the input theta Examples -------- diff --git a/delicatessen/estimating_equations/measurement.py b/delicatessen/estimating_equations/measurement.py index 51618da..2bf0610 100644 --- a/delicatessen/estimating_equations/measurement.py +++ b/delicatessen/estimating_equations/measurement.py @@ -43,21 +43,21 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): theta : ndarray, list, vector Theta consists of 4 values. y : ndarray, list, vector - 1-dimensional vector of n observed values. These are the gold-standard :math:`Y` measurements in the external + 1-dimensional vector of `n` observed values. These are the gold-standard :math:`Y` measurements in the external sample. All values should be either 0 or 1, and be non-missing among those with :math:`R=0`. y_star : ndarray, list, vector - 1-dimensional vector of n observed values. These are the mismeasured :math:`Y` values. All values should be + 1-dimensional vector of `n` observed values. These are the mismeasured :math:`Y` values. All values should be either 0 or 1, and be non-missing among all observations. r : ndarray, list, vector - 1-dimensional vector of n indicators regarding whether an observation was part of the external validation data. - Indicator should designate if observations are the main data. + 1-dimensional vector of `n` indicators regarding whether an observation was part of the external validation + data. Indicator should designate if observations are the main data. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. 
Returns ------- array : - Returns a 4-by-n NumPy array evaluated for the input ``theta`` + Returns a 4-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- @@ -164,7 +164,7 @@ def ee_rogan_gladen_extended(theta, y, y_star, r, X, weights=None): where :math:`Y` is the true value of the outcome, :math:`Y^*` is the mismeasured value of the outcome. The first estimating equation is the corrected proportion, the second is for sensitivity, and the third for specificity. - If :math:`X` is of dimension :math:`p`, then ``theta`` is a 1-by-(1+2p) array. Note that the design matrix is + If :math:`X` is of dimension :math:`p`, then ``theta`` is a 1-by-(1+2`p`) array. Note that the design matrix is shared across the sensitivity and specificity models. Note @@ -177,23 +177,23 @@ def ee_rogan_gladen_extended(theta, y, y_star, r, X, weights=None): theta : ndarray, list, vector Theta consists of 4 values. y : ndarray, list, vector - 1-dimensional vector of n observed values. These are the gold-standard :math:`Y` measurements in the external + 1-dimensional vector of `n` observed values. These are the gold-standard :math:`Y` measurements in the external sample. All values should be either 0 or 1, and be non-missing among those with :math:`R=0`. y_star : ndarray, list, vector - 1-dimensional vector of n observed values. These are the mismeasured :math:`Y` values. All values should be + 1-dimensional vector of `n` observed values. These are the mismeasured :math:`Y` values. All values should be either 0 or 1, and be non-missing among all observations. r : ndarray, list, vector - 1-dimensional vector of n indicators regarding whether an observation was part of the external validation data. - Indicator should designate if observations are the main data. + 1-dimensional vector of `n` indicators regarding whether an observation was part of the external validation + data. Indicator should designate if observations are the main data. X : ndarray, list, vector 2-dimensional vector of a design matrix for the sensitivity and specificity models. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. Returns ------- array : - Returns a 4-by-n NumPy array evaluated for the input ``theta`` + Returns a 4-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- diff --git a/delicatessen/estimating_equations/regression.py b/delicatessen/estimating_equations/regression.py index 11a4e05..61969ef 100644 --- a/delicatessen/estimating_equations/regression.py +++ b/delicatessen/estimating_equations/regression.py @@ -27,39 +27,31 @@ def ee_regression(theta, X, y, model, weights=None, offset=None): Logistic regression uses the inverse-logit function, :math:`\text{expit}(u) = 1 / (1 + \exp(u))`. Finally, Poisson regression is :math:`\exp(u)`. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughout, these - user-defined functions are defined as ``psi``. 
- - - Here, :math:`\theta` corresponds to the coefficients in the corresponding regression model + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented by ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -145,7 +137,7 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o .. math:: - \sum_{i=1}^n W_i \left\{ Y_i - g^{-1}(X_i^T \theta) \times \frac{D(\theta)}{v(\theta)} \right\} X_i = 0 + \sum_{i=1}^n \left\{ Y_i - g^{-1}(X_i^T \theta) \right\} \times \frac{D(\theta)}{v(\theta)} X_i = 0 where :math:`g` is the link function, :math:`g^{-1}` is the inverse link function, :math:`D(\theta)` is the derivative of the inverse link function by :math:`\theta`, and :math:`v(\theta)` is the variance function for the @@ -157,27 +149,19 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o additional parameter-specific estimating equations. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughout, these - user-defined functions are defined as ``psi``. - - - Here, :math:`\theta` corresponds to the coefficients in the corresponding regression model + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. 
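To ground the GLM estimating equation above, here is a minimal sketch using the ``'normal'`` distribution with the ``'identity'`` link (which coincides with ordinary linear regression). The simulated data and column names are hypothetical; only the ``ee_glm`` signature and the option strings documented in this patch are relied on:

    import numpy as np
    import pandas as pd
    from delicatessen import MEstimator
    from delicatessen.estimating_equations import ee_glm

    np.random.seed(515)
    n = 200
    d = pd.DataFrame()
    d['C'] = 1                                       # intercept column
    d['Z'] = np.random.normal(size=n)
    d['Y'] = 2. + 1.5 * d['Z'] + np.random.normal(size=n)

    def psi(theta):
        # normal-distribution GLM with the identity link
        return ee_glm(theta, X=d[['C', 'Z']], y=d['Y'],
                      distribution='normal', link='identity')

    estr = MEstimator(stacked_equations=psi, init=[0., 0.])
    estr.estimate()
    print(estr.theta)                                # intercept and slope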
Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented by ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. distribution : str Distribution for the generalized linear model. Options are: ``'normal'`` (alias: ``gaussian``), @@ -199,22 +183,22 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o ``inverse``, and ``square_root`` (alias: ``sqrt``). hyperparameter : None, int, float - Hyperparameter specification. Default is None. This option is only used by the tweedie distribution. It is + Hyperparameter specification. Default is ``None``. This option is only used by the tweedie distribution. It is ignored by all other distributions. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Note ---- Link and distribution combinations are not checked for their validity. Some pairings may not converge or may - produce nonsensical results. Please check the combination you are using is valid. + produce nonsensical results. Please check the distribution-link combination you are using is valid. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -285,19 +269,19 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o >>> estr = MEstimator(stacked_equations=psi, init=[0., 0., 0., 0.]) >>> estr.estimate() - Note that ``delicatessen`` appropriately incorporates the estimation of the additional parameter for the + Note that delicatessen appropriately incorporates the estimation of the additional parameter for the negative-binomial and gamma distributions. This is unlike some statistical software that estimates this parameter but does *not* incorporate the uncertainty in estimation of that parameter. This may explain differences you - encounter across software (and the ``delicatessen`` implementation is preferred, as it is a more honest expression - of the uncertainty). + encounter across software (and the delicatessen implementation is to be preferred, as it is a more honest + expression of the uncertainty). Finally, the tweedie distribution for GLM is a generalization of the Poisson and gamma distributions. Unlike the - negative-binomial and gamma distributions, there is a fixed (i.e., not estimated) hyperparameter bounded to be >0. - When the tweedie distribution hyperparameter is set to 1, it is equivalent to the Poisson distribution. When the - tweedie distribution hyperparameter is set to 2, it is equivalent to the gamma distribution. 
When the tweedie - distribution hyperparameter is set to 3, it is equivalent to the inverse-normal distribution. However, the tweedie - distribution hyperparameter can be specified for any values. Here, we illustrate the tweedie distribution that is - between a Poisson and gamma distribution. + negative-binomial and gamma distributions, there is a fixed (i.e., not estimated) hyperparameter bounded to be + :math:`>0`. When the tweedie distribution hyperparameter is set to 1, it is equivalent to the Poisson distribution. + When the tweedie distribution hyperparameter is set to 2, it is equivalent to the gamma distribution. When the + tweedie distribution hyperparameter is set to 3, it is equivalent to the inverse-normal distribution. However, the + tweedie distribution hyperparameter can be specified for any values. Here, we illustrate the tweedie distribution + that is between a Poisson and gamma distribution. >>> def psi(theta): >>> return ee_glm(theta, X=X, y=d['Y1'], @@ -364,15 +348,11 @@ def ee_mlogit(theta, X, y, weights=None, offset=None): r"""Estimating equation for multinomial logistic regression. This estimating equation functionality supports unranked categorical outcome data, unlike ``ee_regression`` and ``ee_glm``. - Note - ---- Unlike the other regression estimating equations, ``ee_mlogit`` expects a matrix of indicators for each possible value of ``y``, with the first column being used as the referent category. In other words, the outcome variable is a matrix of dummy variables that includes the reference. - - - The estimating equation for column :math:`r` of the indicator variable :math:`Y_{r}` - of a :math:`Y` with :math:`k` unique categories is + The estimating equation for column :math:`r` of the indicator variable :math:`Y_{r}` of a :math:`Y` with :math:`k` + unique categories is .. math:: @@ -380,35 +360,30 @@ def ee_mlogit(theta, X, y, weights=None, offset=None): X_i = 0 where :math:`\theta_r` are the coefficients correspond to the log odds ratio comparing :math:`Y_r` to all other - categories of :math:`Y`. Here, :math:`\theta` is a 1-by-(b :math`\times` (k-1)) array, where b is the distinct - covariates included as part of X. So, the stack of estimating equations consists of :math:`(k-1)` estimating - equations of the dimension :math:`X_i`. For example, if X is a 3-by-n matrix and :math:`Y` has three unique + categories of :math:`Y`. Here, :math:`\theta` is a 1-by-(`b` :math`\times` (`k`-1)) array, where `b` is the distinct + covariates included as part of ``X``. So, the stack of estimating equations consists of (`k`-1) estimating + equations of the dimension :math:`X_i`. For example, if X is a 3-by-`n` matrix and :math:`Y` has three unique categories, then :math:`\theta` will be a 1-by-6 array. - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b :math:`\times` (k-1) values. Therefore, initial values should consist of the - same number as the number of columns present in the design matrix for each category of the outcome matrix + Theta in this case consists of `b` :math:`\times` (`k`-1) values. Therefore, initial values should consist of + the same number as the number of columns present in the design matrix for each category of the outcome matrix besides the reference. 
X : ndarray, list, vector - 2-dimensional design matrix of n observed covariates for b variables. + 2-dimensional design matrix of `n` observed covariates for `b` variables. y : ndarray, list, vector - 2-dimensional indicator matrix of n observed outcomes. + 2-dimensional indicator matrix of `n` observed outcomes. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a (b*(k-1))-by-n NumPy array evaluated for the input ``theta`` + Returns a (`b` :math:`\times` (`k`-1))-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -532,46 +507,41 @@ def ee_robust_regression(theta, X, y, model, k, loss='huber', weights=None, uppe occurring is zero. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. Options include: ``'linear'`` (linear regression). k : int, float Tuning or hyperparameter for the chosen loss function. Notice that the choice of hyperparameter should depend on the chosen loss function. loss : str, optional - Robust loss function to use. Default is 'huber'. Options include 'andrew', 'hampel', 'huber', 'tukey'. + Robust loss function to use. Default is ``'huber'``. Options include ``'andrew'``, ``'hampel'``, ``'tukey'``. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. lower : int, float, None, optional - Lower parameter for the 'hampel' loss function. This parameter does not impact the other loss functions. 
+ Lower parameter for the Hampel loss function. This parameter does not impact the other loss functions. Default is ``None``. upper : int, float, None, optional - Upper parameter for the 'hampel' loss function. This parameter does not impact the other loss functions. + Upper parameter for the Hampel loss function. This parameter does not impact the other loss functions. Default is ``None``. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- @@ -668,9 +638,9 @@ def ee_ridge_regression(theta, X, y, model, penalty, weights=None, center=0., of where :math:`\lambda` is the penalty term. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. Note ---- @@ -680,38 +650,37 @@ def ee_ridge_regression(theta, X, y, model, penalty, weights=None, center=0., of Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). penalty : int, float, ndarray, list, vector Penalty term to apply to all coefficients (if only a integer or float is provided) or the corresponding coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either - consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n. + consists of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. center : int, float, ndarray, list, vector, optional Center or reference value to penalized estimated coefficients towards. Default is ``0``, which penalized coefficients towards the null. 
Other center values can be specified for all coefficients (by providing an integer or float) or covariate-specific centering values (by providing a vector of values of the same length as X). offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- - Construction of a estimating equation(s) with ``ee_ridge_regression`` should be done similar to the - following + Construction of a estimating equation(s) with ``ee_ridge_regression`` should be done similar to the following >>> import numpy as np >>> import pandas as pd @@ -796,12 +765,6 @@ def ee_lasso_regression(theta, X, y, model, penalty, epsilon=3.e-3, weights=None r"""Estimating equation for an approximate LASSO (least absolute shrinkage and selection operator) regressor. LASSO regression applies an L1-regularization through a magnitude penalty. - Note - ---- - As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich) - cannot be used to estimate the variance in all settings. - - The estimating equation for the approximate LASSO linear regression is .. math:: @@ -811,13 +774,19 @@ def ee_lasso_regression(theta, X, y, model, penalty, epsilon=3.e-3, weights=None where :math:`\lambda` is the penalty term. + Note + ---- + As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich) + cannot be used to estimate the variance in all settings. + + Here, an approximation based on the bridge penalty for the LASSO is used. For the bridge penalty, LASSO is the special case where :math:`\epsilon = 0`. By making :math:`\epsilon > 0`, we can approximate the LASSO. The true LASSO may not be possible to implement due to the existence of multiple solutions - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. Note ---- @@ -828,36 +797,36 @@ def ee_lasso_regression(theta, X, y, model, penalty, epsilon=3.e-3, weights=None Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. 
Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). penalty : int, float, ndarray, list, vector Penalty term to apply to all coefficients (if only a integer or float is provided) or the corresponding coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either - consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n. + consists of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`. epsilon : float, optional Approximation error to use for the LASSO approximation. Default argument is ``0.003``, which results in a - bridge penalty of 1.0003. + bridge penalty of ``1.0003``. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. center : int, float, ndarray, list, vector, optional Center or reference value to penalized estimated coefficients towards. Default is ``0``, which penalized coefficients towards the null. Other center values can be specified for all coefficients (by providing an integer or float) or covariate-specific centering values (by providing a vector of values of the same length as X). offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -949,12 +918,6 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3, pre-specified ratio. Notice that the L1 penalty is based on an approximation. See ``ee_lasso_regression`` for further details on the approximation for the L1 penalty. - Note - ---- - As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich) - cannot be used to estimate the variance in all settings. - - The estimating equation for Elastic-Net linear regression with the approximate L1 penalty is .. math:: @@ -964,9 +927,15 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3, where :math:`\lambda` is the penalty term and :math:`r` is the ratio for the L1 vs L2 penalty. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). + Note + ---- + As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich) + cannot be used to estimate the variance in all settings. + + + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. 
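As a rough sketch of the Elastic-Net specification described above, the snippet below penalizes all coefficients except the intercept. The simulated data, penalty values, and ratio are hypothetical; only the ``ee_elasticnet_regression`` signature shown in this patch is relied on:

    import numpy as np
    from delicatessen import MEstimator
    from delicatessen.estimating_equations import ee_elasticnet_regression

    np.random.seed(809)
    n = 250
    X = np.c_[np.ones(n), np.random.normal(size=(n, 3))]   # intercept plus 3 covariates
    y = 1. + 2. * X[:, 1] + np.random.normal(size=n)       # only the first covariate matters

    def psi(theta):
        # per-coefficient penalties (no penalty on the intercept); ratio=0.5 mixes L1 and L2 equally
        return ee_elasticnet_regression(theta, X=X, y=y, model='linear',
                                        penalty=[0., 10., 10., 10.], ratio=0.5)

    estr = MEstimator(stacked_equations=psi, init=[0., ] * X.shape[1])
    estr.estimate()
    print(estr.theta)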
Note ---- @@ -976,25 +945,25 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3, Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). penalty : int, float, ndarray, list, vector Penalty term to apply to all coefficients (if only a integer or float is provided) or the corresponding coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either - consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n. + consists of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`. ratio : float Ratio for the L1 vs L2 penalty in Elastic-net. The ratio must be be :math:`0 \le r \le 1`. Setting ``ratio=1`` results in LASSO and ``ratio=0`` results in ridge regression. epsilon : float, optional Approximation error to use for the LASSO approximation. Default argument is ``0.003``, which results in a - bridge penalty of 1.0003. + bridge penalty of ``1.0003``. weights : ndarray, list, vector, None, optional 1-dimensional vector of n weights. Default is ```None``, which assigns a weight of 1 to all observations. center : int, float, ndarray, list, vector, optional @@ -1003,12 +972,12 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3, integer or float) or covariate-specific centering values (by providing a vector of values of the same length as X). offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input theta and y + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -1128,9 +1097,9 @@ def ee_bridge_regression(theta, X, y, model, penalty, gamma, weights=None, cente where :math:`\lambda` is the penalty term and :math:`\gamma` is a tuning parameter. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. 
Note ---- @@ -1140,19 +1109,19 @@ def ee_bridge_regression(theta, X, y, model, penalty, gamma, weights=None, cente Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). penalty : int, float, ndarray, list, vector Penalty term to apply to all coefficients (if only a integer or float is provided) or the corresponding coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either - consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n. + consists of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`. gamma : float Hyperparameter for the bridge penalty, defined for :math:`\gamma > 0`. However, only :math:`\gamma \ge 1` are supported. @@ -1164,12 +1133,12 @@ def ee_bridge_regression(theta, X, y, model, penalty, gamma, weights=None, cente integer or float) or covariate-specific centering values (by providing a vector of values of the same length as X). offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -1304,44 +1273,44 @@ def ee_additive_regression(theta, X, y, specifications, model, weights=None, off ``ee_additive_regression`` through setting the knot locations. - Here, :math:`\theta` is a 1-by-(b+k) array, where b is the distinct covariates included as part of X and the k - distinct spline basis functions. For example, if X is a 2-by-n matrix with a 10-knot natural spline for the second - column in X, then :math:`\theta` will be a 1-by-(2+9) array. The code is general to allow for an arbitrary - number of X variables and spline knots. + Here, :math:`\theta` is a 1-by-(`b`+`k`) array, where `b` is the distinct covariates included as part of ``X`` and + the `k` distinct spline basis functions. For example, if ``X`` is a 2-by-`n` matrix with a 10-knot natural spline + for the second column in X, then :math:`\theta` will be a 1-by-(2+9) array. The code is general to allow for an + arbitrary number of X variables and spline knots. Parameters ---------- theta : ndarray, list, vector - Parameter values. Number of values should match the number of columns in the additive design matrix. + Theta in this case consists of `b`+`k` values. Number of values should match the number of columns in the + additive design matrix. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. 
+ 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. specifications : list, dict, None A list of dictionaries that define the hyperparameters for the spline (e.g., number of knots, strength of penalty). For terms that should not have splines, ``None`` should be specified instead (see examples below). - Each dictionary supports the following parameters: - "knots", "natural", "power", "penalty" - * knots (list): controls the position of the knots, with knots are placed at given locations. There is no - default, so must be specified by the user. - * natural (bool): controls whether to generate natural (restricted) or unrestricted splines. - Default is ``True``, which corresponds to natural splines. - * power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic - splines. - * penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0, - which applies no penalty to the spline basis terms. + Each dictionary supports the following parameters: "knots", "natural", "power", "penalty" + knots (list): controls the position of the knots, with knots are placed at given locations. There is no + default, so must be specified by the user. + natural (bool): controls whether to generate natural (restricted) or unrestricted splines. + Default is ``True``, which corresponds to natural splines. + power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic + splines. + penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0, + which applies no penalty to the spline basis terms. model : str Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a (b+k)-by-n NumPy array evaluated for the input ``theta`` + Returns a (`b`+`k`)-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -1572,6 +1541,15 @@ def _model_transform_(model, assert_linear_model=False): def _inverse_link_(betax, link): + """Internal function to return the evaluated inverse link and derivative of the inverse link. + + Parameters + ---------- + betax : ndarray, list, array + Parameter values + link : str + Specified link function + """ # Distributions not implemented: power, inverse power if link == 'identity': py = identity(betax) # Inverse link @@ -1612,6 +1590,19 @@ def _inverse_link_(betax, link): def _distribution_variance_(dist, mu, hyperparameter=None, alpha=None): + """Internal function to return the distribution variance for GLM specifications. 
+ + Parameters + ---------- + dist : str + Distribution + mu : + Prediction + hyperparameter : int, float, None, optional + Hyperparameter for the Tweedie distribution + alpha : int, float, None, optional + Hyperparameter for gamma or negative-binomial + """ if dist in ['normal', 'gaussian']: v = 1 elif dist == 'poisson': diff --git a/delicatessen/estimating_equations/survival.py b/delicatessen/estimating_equations/survival.py index f853ac3..2283e1b 100644 --- a/delicatessen/estimating_equations/survival.py +++ b/delicatessen/estimating_equations/survival.py @@ -27,26 +27,20 @@ def ee_exponential_model(theta, t, delta): h(t) = \lambda - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - - Parameters ---------- theta : ndarray, list, vector Theta in the case of the exponential model consists of a single value. Furthermore, the parameter will be non-negative. Therefore, an initial value like the ``[1, ]`` should be provided. t : ndarray, list, vector - 1-dimensional vector of n observed times. + 1-dimensional vector of `n` observed times. delta : ndarray, list, vector - 1-dimensional vector of n event indicators, where 1 indicates an event and 0 indicates right censoring. + 1-dimensional vector of `n` event indicators, where 1 indicates an event and 0 indicates right censoring. Returns ------- array : - Returns a 1-by-n NumPy array evaluated for the input ``theta`` + Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- @@ -121,28 +115,22 @@ def ee_weibull_model(theta, t, delta): h(t) = \lambda \gamma t^{\gamma - 1} - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - - Parameters ---------- theta : ndarray, list, vector - Theta in the case of the exponential model consists of a single value. Furthermore, the parameter will be + Theta in the case of the Weibull model consists of two values. Furthermore, the parameter will be non-negative. Therefore, an initial value like the ``[1, ]`` is recommended. t : ndarray, list, vector - 1-dimensional vector of n observed times. No missing data should be included (missing data may cause + 1-dimensional vector of `n` observed times. No missing data should be included (missing data may cause unexpected behavior). delta : ndarray, list, vector - 1-dimensional vector of n event indicators, where 1 indicates an event and 0 indicates right censoring. No + 1-dimensional vector of `n` event indicators, where 1 indicates an event and 0 indicates right censoring. No missing data should be included (missing data may cause unexpected behavior). Returns ------- array : - Returns a 2-by-n NumPy array evaluated for the input theta. + Returns a 2-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -254,7 +242,7 @@ def ee_exponential_measure(theta, times, n, measure, scale): Returns ------- array : - Returns a t-by-n NumPy array evaluated for the input ``theta`` + Returns a `t`-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- @@ -376,8 +364,8 @@ def ee_weibull_measure(theta, times, n, measure, scale, shape): Parameters ---------- theta : ndarray, list, vector - theta consists of t values. The initial values should consist of the same number of elements as provided in the - ``times`` argument. + theta consists of `t` values. 
The initial values should consist of the same number of elements as provided in + the ``times`` argument. times : int, float, ndarray, list, vector A single time or 1-dimensional collection of times to calculate the measure at. The number of provided times should consist of the same number of elements as provided in the ``theta`` argument. @@ -396,7 +384,7 @@ def ee_weibull_measure(theta, times, n, measure, scale, shape): Returns ------- array : - Returns a t-by-n NumPy array evaluated for the input ``theta`` + Returns a `t`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -546,34 +534,29 @@ def ee_aft_weibull(theta, X, t, delta, weights=None): following relation between the coefficients: :math:`\lambda = - \mu \gamma`, :math:`\beta_{PH} = - \beta_{AFT} \gamma`, and :math:`\gamma = \exp(\sigma)`. - Here, :math:`\theta` is a 1-by-(2+b) array, where b is the distinct covariates included as part of X. For example, - if X is a 3-by-n matrix, then theta will be a 1-by-5 array. The code is general to allow for an arbitrary number of - X's (as long as there is enough support in the data). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. + Here, :math:`\theta` is a 1-by-(2+`b`) array, where `b` is the distinct covariates included as part of ``X``. For + example, if ``X`` is a 3-by-`n` matrix, then theta will be a 1-by-5 array. The code is general to allow for an + arbitrary dimension of ``X``. Parameters ---------- theta : ndarray, list, vector - theta consists of 1+b+1 values. Therefore, initial values should consist of the same number as the number of + theta consists of 1+`b`+1 values. Therefore, initial values should consist of the same number as the number of columns present in ``X`` plus 2. This can easily be implemented via ``[0, ] + [0, ] * X.shape[1] + [0, ]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. t : ndarray, list, vector - 1-dimensional vector of n observed times. + 1-dimensional vector of `n` observed times. delta : ndarray, list, vector - 1-dimensional vector of n values indicating whether the time was an event or censoring. + 1-dimensional vector of `n` values indicating whether the time was an event or censoring. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta``. + Returns a 1+`b`+1-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -711,13 +694,13 @@ def ee_aft_weibull_measure(theta, times, X, measure, mu, beta, sigma): Parameters ---------- theta : ndarray, list, vector - theta consists of t values. The initial values should consist of the same number of elements as provided in the + theta consists of `t` values. The initial values should consist of the same number of elements as provided in the ``times`` argument. times : int, float, ndarray, list, vector A single time or 1-dimensional collection of times to calculate the measure at. The number of provided times should consist of the same number of elements as provided in the ``theta`` argument. 
X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. measure : str Measure to calculate. Options include survival (``'survival'``), density (``'density'``), risk or the cumulative density (``'risk'``), hazard (``'hazard'``), or cumulative hazard (``'cumulative_hazard'``). @@ -731,7 +714,7 @@ def ee_aft_weibull_measure(theta, times, X, measure, mu, beta, sigma): Returns ------- array : - Returns a t-by-n NumPy array evaluated for the input theta + Returns a `t`-by-`n` NumPy array evaluated for the input theta Examples -------- diff --git a/delicatessen/mestimation.py b/delicatessen/mestimation.py index 7b87d2c..f272542 100644 --- a/delicatessen/mestimation.py +++ b/delicatessen/mestimation.py @@ -28,18 +28,20 @@ class MEstimator: ---- Estimating equations are advantageous in both theoretical and applied research. They simplifies proofs of consistency and asymptotic normality of estimators under a large-sample approximation framework. In application, - this approach to esitmation simplifies estimation of the variance of parameters and automates the delta-method. + this approach simplifies variance estimation and automates the delta-method. M-Estimators consists of two broad step: point estimation and variance estimation. Point estimation is carried out - by determining the values of :math:`\theta` where the sum of the estimating equations are zero. For variance - estimation, the asymptotic sandwich variance estimator is used, which consists of + by determining the values of :math:`\theta` where the sum of the estimating equations are zero. This is done via + standard root-finding algorithms. + + For variance estimation, sandwich variance estimator is used. The asymptotic sandwich variance estimator consists of .. math:: - B_n(O, \hat{\theta})^{-1} F_n(O, \hat{\theta}) \left\{B_n(O, \hat{\theta}^{-1})\right\}^T + V_n(O, \hat{\theta}) = B_n(O, \hat{\theta})^{-1} F_n(O, \hat{\theta}) \left\{B_n(O, \hat{\theta}^{-1})\right\}^T - where :math:`B` is the 'bread' and :math:`F` is the 'filling' + where :math:`B` is the 'bread' and :math:`F` is the 'filling' matrix. These matrices are defined as .. math:: @@ -49,31 +51,33 @@ class MEstimator: F_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} \psi(O_i, \hat{\theta}) \psi(O_i, \hat{\theta})^T - The partial derivatives for the bread are calculated using either numerical approximation (e.g., forward difference - method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's ``linalg.pinv``. For - the filling, the dot product is taken at :math:`\hat{\theta}`. + respectively. The partial derivatives for the bread are calculated using either numerical approximation (e.g., + forward difference method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's + ``linalg.pinv``. For the filling, the dot product is taken at :math:`\hat{\theta}`. Note ---- - A hard part, that must be done by the user, is to specify the estimating equations. Be sure to check the provided - examples for the expected format. Pre-built estimating equations for common problems are also made available. + The difficult part (that must be done by the user) is to specify the estimating equations. Be sure to check the + provided examples for the expected format. Pre-built estimating equations for common problems are also made + available. After completion of these steps, point and variance estimates are stored. 
These can be extracted from - ``MEstimator``. + ``MEstimator``. Further, confidence intervals, Z-scores, P-values, or S-values can all be generated. Note ---- For complex regression problems, the root-finding algorithms are not as robust relative to maximization approaches. - A solution for difficult problems is to 'pre-wash' the initial values. + A simple solution for difficult problems is to 'pre-wash' or find the solution to the equations and provide those + as the initial starting values. Parameters ---------- stacked_equations : function, callable - Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the documentation for how to construct a set of estimating equations. init : list, set, array - Initial values for the root-finding algorithm. + Initial values for the root-finding algorithm. A total of `v` values should be provided. subset : list, set, array, None, optional Optional argument to conduct the root-finding procedure on a subset of parameters in the estimating equations. The input list is used to location index the parameter array via ``np.take()``. The subset list will @@ -82,9 +86,10 @@ class MEstimator: Note ---- - Because the root-finding procedure is NOT ran for parameters outside of the subset, those coefficients must be - solved outside of ``MEstimator``. In general, I do NOT recommend using the ``subset`` argument unless a series of - complex estimating equations need to be solved. + Because the root-finding procedure is NOT ran for parameters outside of the subset, those coefficients *must* be + solved outside of ``MEstimator``. In general, I do *NOT* recommend using the ``subset`` argument unless a series of + complex estimating equations need to be solved. In general, this argument does not massively improve speed until + the estimating equations consist of hundreds of parameters. Examples -------- @@ -115,6 +120,7 @@ class MEstimator: >>> estr.variance # Covariance >>> estr.asymptotic_variance # Asymptotic covariance >>> np.sqrt(np.diag(estr.asymptotic_variance)) # Standard deviation + >>> estr.variance # Covariance >>> np.sqrt(np.diag(estr.variance)) # Standard error >>> estr.confidence_intervals() # Confidence intervals >>> estr.z_scores() # Z-scores @@ -122,9 +128,9 @@ class MEstimator: >>> estr.s_values() # S-values Alternatively, a custom estimating equation can be specified. This is done by constructing a valid estimating - equation for the ``MEstimator``. The ``MEstimator`` expects the ``psi`` function to return a b-by-n array, where b - is the number of parameters (length of ``theta``) and n is the total number of observations. Below is an example - of the mean and variance estimating equation from before + equation for the ``MEstimator``. The ``MEstimator`` expects the ``psi`` function to return a `v`-by-`n` array, + where `v` is the number of parameters (length of ``theta``) and n is the total number of observations. Below is an + example of the mean and variance estimating equation from before, but implemented by-hand >>> def psi(theta): >>> y = np.array(y_dat) @@ -137,7 +143,7 @@ class MEstimator: >>> estr = MEstimator(stacked_equations=psi, init=[0, 0, ]) >>> estr.estimate() - Note that ``len(init)`` should be equal to b. So in this case, two initial values are provided. + Note that ``len(init)`` should be equal to `v`. So in this case, two initial values are provided. 
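The ``subset`` argument described above follows the same pattern. Below is a hedged sketch that extends the mean and variance example with a third parameter for the mean-to-variance ratio; this extended parameterization is an illustration and not part of the package. Because the ratio is supplied externally through ``init``, only the first two parameters are passed to the root-finder.

    >>> def psi(theta):
    >>>     y = np.array(y_dat)
    >>>     mean = y - theta[0]
    >>>     variance = (y - theta[0]) ** 2 - theta[1]
    >>>     ratio = np.ones(y.shape[0]) * (theta[0] / theta[1] - theta[2])
    >>>     return np.vstack([mean, variance, ratio])

    >>> external_ratio = np.mean(y_dat) / np.var(y_dat, ddof=0)    # solved outside of MEstimator
    >>> estr = MEstimator(stacked_equations=psi, init=[0., 1., external_ratio], subset=[0, 1])
    >>> estr.estimate()

    Here the root-finding step only updates ``theta[0]`` and ``theta[1]``, while the third value is taken as given from ``init``.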
``MEstimator`` can also be run with a user-provided root-finding algorithm. To specify a custom root-finder, a function must be created by the user that consists of two keyword arguments (``stacked_equations``, ``init``) and @@ -164,7 +170,10 @@ class MEstimator: Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference (pp. 297-337). Springer, New York, NY. - Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. The American Statistician, 56(1), 29-38. + Ross RK, Zivich PN, Stringer JSA, & Cole SR. (2024). M-estimation for common epidemiological measures: introduction + and applied examples. *International Journal of Epidemiology*, 53(2), dyae030. + + Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. *The American Statistician*, 56(1), 29-38. """ def __init__(self, stacked_equations, init=None, subset=None): self.stacked_equations = stacked_equations # User-input stacked estimating equations @@ -183,36 +192,38 @@ def __init__(self, stacked_equations, init=None, subset=None): self.asymptotic_variance = None # Asymptotic covariance matrix for theta values (calculated later) def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='approx', dx=1e-9, allow_pinv=True): - """Function to carry out the point and variance estimation of theta. After this procedure, the point estimates - (in ``theta``) and the covariance matrix (in ``variance``) can be extracted. + """Run the point and variance estimation procedures for given estimating equation and starting values. This + function carries out the point and variance estimation of ``theta``. After this procedure, the point estimates + (in ``theta``) and the covariance matrix (in ``variance``) can be extracted from the ``MEstimator`` object. Parameters ---------- solver : str, function, callable, optional Method to use for the root-finding procedure. Default is the Levenberg-Marquardt algorithm - (``scipy.optimize.root(method='lm')``). Other built-in option is the secant method - (``scipy.optimize.newton``), and a modification of the Powell hybrid method - (``scipy.optimize.root(method='hybr')``). Finally, any generic root-finding algorithm can be used via a - user-provided callable object. The function must consist of two keyword arguments: ``stacked_equations``, - and ``init``. Additionally, the function should return only the optimized values. Please review the - provided example in the documentation for how to implement a custom root-finding algorithm. + (``scipy.optimize.root(method='lm')``, specified by ``solver='lm'``). Other built-in option is the secant + method (``scipy.optimize.newton``, specified by ``solver='newton'``), and a modification of the Powell + hybrid method (``scipy.optimize.root(method='hybr')``, specified by ``solver='hybr'``). Finally, any generic + root-finding algorithm can be used via a user-provided callable object. The function must consist of two + keyword arguments: ``stacked_equations``, and ``init``. Additionally, the function should return only the + optimized values. Please review the provided example in the documentation for how to implement a custom + root-finding algorithm. maxiter : int, optional Maximum iterations to consider for the root finding procedure. Default is 5000 iterations. For complex - estimating equations (without preceding optimization), this value may need to be increased. This argument - is not used for user-specified solvers. + estimating equations, this value may need to be increased. 
This argument is not used when a custom + root-finding method (e.g., ``solver``) is provided. tolerance : float, optional - Maximum tolerance for errors in the root finding. This argument is passed ``scipy.optimize`` via the - ``tol`` parameter. This argument is not used for user-specified solvers. Default is 1e-9. + Maximum tolerance for errors in the root finding in ``scipy.optimize``. Default is 1e-9. This argument is + not used when a custom root-finding method (e.g., ``solver``) is provided. deriv_method : str, optional - Method to compute the derivative of the estimating equations for the bread matrix. Options include numerical - approximation via the forward difference method via SciPy (``'approx'``), forward difference implemented - by-hand (`'fapprox'`), backward difference implemented by-hand (`'bapprox'`), central difference - implemented by-hand (`'capprox'`), or forward-mode automatic differentiation (``'exact'``). - Default is ``'approx'``. + Method to compute the derivative of the estimating equations for the bread matrix. Default is ``'approx'``. + Options include numerical approximation via the forward difference method via SciPy (``'approx'``), forward + difference as implemented in delicatessen (`'fapprox'`), backward difference as implemented in delicatessen + (`'bapprox'`), central difference implemented as in delicatessen (`'capprox'`), or forward-mode automatic + differentiation as implemented in delicatessen(``'exact'``). dx : float, optional - Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value - for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used with numerical approximation methods. Default is 1e-9. + Spacing to use to numerically approximate the partial derivatives of the bread matrix. Default is 1e-9. + Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. + This argument is only used with numerical approximation methods. allow_pinv : bool, optional Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this @@ -322,8 +333,8 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr self.variance = self.asymptotic_variance / self.n_obs def confidence_intervals(self, alpha=0.05): - r"""Calculate Wald-type :math:`(1 - \alpha) \times` 100% confidence intervals using the point estimates and - the sandwich variance. The formula for the confidence intervals are + r"""Calculate two-sided Wald-type :math:`(1 - \alpha) \times` 100% confidence intervals using the point + and sandwich variance estimates. The formula for the confidence intervals is .. math:: @@ -372,7 +383,7 @@ def z_scores(self, null=0): \frac{\hat{\theta} - \theta}{\widehat{SE}(\hat{\theta})} where :math:`\theta` is the null. The ``.estimate()`` function must be called before the Z-scores can be - calculated. + calculated. Note that the default value for the null is zero. Parameters ---------- @@ -418,13 +429,8 @@ def p_values(self, null=0): def s_values(self, null=0): r"""Calculate two-sided Wald-type S-values using the point estimates and the sandwich variance. The S-value, or Shannon Information value, is a transformation of the P-values that has been argued to be more easily - interpretable as it can be related back to simple coin-flipping scenarios. 
- - Suppose the S-value is :math:`s`. Then :math:`s` corresponds to the number of heads in a row with a fair coin - (equal chances heads or tails). As :math:`s` increases, one would be more 'surprised' by the result (e.g., it - might not be surprising to have two heads in a row, but it would be surprising for 10 in a row). - - The transformation from a P-value into a S-value is. + interpretable as it can be related back to simple coin-flipping scenarios. The transformation from a P-value + into a S-value is. .. math:: @@ -433,6 +439,11 @@ def s_values(self, null=0): where :math:`P` is the corresponding P-value. The ``.estimate()`` function must be called before the S-values can be calculated. + The S-value can be contextualized in terms of coin-flips. Suppose the S-value is :math:`s`. Then :math:`s` + corresponds to the number of heads in a row with a fair coin (equal chances heads or tails). As :math:`s` + increases, one would be more 'surprised' by the result (e.g., it might not be surprising to have 2 heads in a + row, but it would be surprising for 20 in a row). + Parameters ---------- null: int, float, ndarray, optional @@ -526,8 +537,8 @@ def _mestimator_sum_(stacked_equations, subset): @staticmethod def _solve_coefficients_(stacked_equations, init, method, maxiter, tolerance): - """Quasi-Newton solver for the values of theta, such that the estimating equations are equal to zero. Default - uses the secant method from SciPy's `newton` optimizer. + """Calls the root-finding procedure for the values of theta, such that the estimating equations are equal to + zero. Default uses the Levenberg-Marquardt algorithm from SciPy. Parameters ---------- @@ -589,11 +600,9 @@ def _solve_coefficients_(stacked_equations, init, method, maxiter, tolerance): raise ValueError("The user-specified root-finding `solver` must return the solution to the " "optimization") else: - raise ValueError("The solver '" + # ... otherwise throw ValueError - str(method) + - "' is not available. Please see the " - "documentation for valid" - "options for the optimizer.") + # ... otherwise throw ValueError if no other root-finding steps are triggered. + raise ValueError("The solver '" + str(method) + "' is not available. Please see the " + "documentation for valid options for the optimizer.") # Return optimized theta array return psi diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py index e337345..1fcba4b 100644 --- a/delicatessen/sandwich.py +++ b/delicatessen/sandwich.py @@ -24,23 +24,31 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a V_n(O_i; \theta) = B_n(O_i; \theta)^{-1} F_n(O_i; \theta) \left[ B_n(O_i; \theta)^{-1} \right]^{T} - where :math:`B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial}{\partial \theta} \psi(O_i; \theta)`, - :math:`F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T`, and :math:`\psi(O_i; \theta)` is the - estimating function. + where :math:`\psi(O_i; \theta)` is the estimating function, + + .. math:: + + B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial}{\partial \theta} \psi(O_i; \theta), + + and + + .. math:: + + F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T . To compute the bread matrix, :math:`B_n`, the matrix of partial derivatives is computed by using either finite difference methods or automatic differentiation. For finite differences, the default is to use SciPy's - ``approx_fprime`` functionality, which uses forward finite differences. 
However, you can also use homebrew version - that allows for forward, backward, and center differences. Automatic differentiation is also supported by a - homebrew version. + ``approx_fprime`` functionality, which uses forward finite differences. However, you can also use the delicatessen + homebrew version that allows for forward, backward, and center differences. Automatic differentiation is also + supported by a homebrew version. To compute the meat matrix, :math:`F_n`, only linear algebra methods, implemented through NumPy, are necessary. - The sandwich is then constructed from these individual pieces. + The sandwich is then constructed from these pieces using linear algebra methods from NumPy. Parameters ---------- stacked_equations : function, callable - Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the documentation for how to construct a set of estimating equations. theta : list, set, array Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes @@ -53,7 +61,7 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a dx : float, optional Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used with numerical approximation methods. Default is 1e-9. + used with numerical approximation methods. Default is ``1e-9``. allow_pinv : bool, optional Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this @@ -62,7 +70,7 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` Examples -------- @@ -76,7 +84,7 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a >>> y_dat = [1, 2, 4, 1, 2, 3, 1, 5, 2] The following is an illustration of how to compute sandwich covariance using only an estimating equation and the - paramter values. The mean and variance (that correspond to ``ee_mean_variance``) can be computed using NumPy by + parameter values. The mean and variance (that correspond to ``ee_mean_variance``) can be computed using NumPy by >>> mean = np.mean(y_dat) >>> var = np.var(y_dat, ddof=0) @@ -96,11 +104,18 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a >>> sandwich = sandwich_asymp / len(y_dat) + The standard errors are then + + >>> se = np.sqrt(np.diag(sandwich)) + References ---------- Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference (pp. 297-337). Springer, New York, NY. + Ross RK, Zivich PN, Stringer JSA, & Cole SR. (2024). M-estimation for common epidemiological measures: introduction + and applied examples. *International Journal of Epidemiology*, 53(2), dyae030. + Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. The American Statistician, 56(1), 29-38. 
""" # Evaluating at provided theta values @@ -132,21 +147,23 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a def compute_bread(stacked_equations, theta, deriv_method, dx=1e-9): - """Function to compute the bread matrix. The bread matrix is defined as + r"""Function to compute the bread matrix. The bread matrix is defined as .. math:: - B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial \psi(O_i; \theta)}{\partial \theta} + B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial }{\partial \theta} \psi(O_i; \theta) - The matrix of partial derivatives is computed by using either finite difference methods or automatic - differentiation. For finite differences, the default is to use SciPy's ``approx_fprime`` functionality, which uses - forward finite differences. However, you can also use homebrew version that allows for forward, backward, and - center differences. Automatic differentiation is also supported by a homebrew version. + where :math:`\psi(O_i; \theta)` is the estimating function. + To compute the bread matrix, :math:`B_n`, the matrix of partial derivatives is computed by using either finite + difference methods or automatic differentiation. For finite differences, the default is to use SciPy's + ``approx_fprime`` functionality, which uses forward finite differences. However, you can also use the delicatessen + homebrew version that allows for forward, backward, and center differences. Automatic differentiation is also + supported by a homebrew version. Parameters ---------- stacked_equations : function, callable - Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the documentation for how to construct a set of estimating equations. theta : list, set, array Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes @@ -159,12 +176,12 @@ def compute_bread(stacked_equations, theta, deriv_method, dx=1e-9): dx : float, optional Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used when numerical approximation methods. Default is 1e-9. + used when numerical approximation methods. Default is ``1e-9``. Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` """ def estimating_equation(input_theta): if len(input_theta) == 1: @@ -215,19 +232,20 @@ def estimating_equation(input_theta): def compute_meat(stacked_equations, theta): - """Function to compute the meat matrix. The meat matrix is defined as + r"""Function to compute the meat matrix. The meat matrix is defined as .. math:: F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T + where :math:`\psi(O_i; \theta)` is the estimating function. Rather than summing over all the individual contributions, this implementation takes a single dot product of the stacked estimating functions. This implementation is much faster than summing over :math:`n` matrices. Parameters ---------- stacked_equations : function, callable - Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + Function that returns a `v`-by-`n` NumPy array of the estimating equations. 
See provided examples in the documentation for how to construct a set of estimating equations. theta : list, set, array Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes @@ -236,14 +254,14 @@ def compute_meat(stacked_equations, theta): Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` """ evald_theta = np.asarray(stacked_equations(theta=theta)) # Evaluating EE at theta-hat return np.dot(evald_theta, evald_theta.T) # Return the fast dot product calculation def build_sandwich(bread, meat, allow_pinv=True): - """Function to combine the sandwich elements together. This function takes the bread and meat matrices, does the + r"""Function to combine the sandwich elements together. This function takes the bread and meat matrices, does the inversions, and then combines them together. This function is separate from ``compute_sandwich`` as it is called by both ``compute_sandwich`` and ``MEstimator``. @@ -261,7 +279,7 @@ def build_sandwich(bread, meat, allow_pinv=True): Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` """ # Check if there is an issue with the bread matrix if np.any(np.isnan(bread)): # If bread contains NaN, breaks diff --git a/delicatessen/utilities.py b/delicatessen/utilities.py index 829cec0..3a3f263 100644 --- a/delicatessen/utilities.py +++ b/delicatessen/utilities.py @@ -6,7 +6,11 @@ def logit(prob): - """Logistic transformation of probabilities. Returns log-odds + r"""Logistic transformation. Used to transform probabilities into log-odds. + + .. math:: + + \log \left( \frac{p}{1-p} \right) Parameters ---------- @@ -15,13 +19,18 @@ def logit(prob): Returns ------- - logit-transformed probabilities + array : + logit-transformed values """ return np.log(prob / (1 - prob)) def inverse_logit(logodds): - """Inverse logistic transformation. Returns probabilities + r"""Inverse logistic transformation. Used to transform log-odds into probabilities. + + .. math:: + + \frac{1}{1 + \exp(o)} Parameters ---------- @@ -30,13 +39,14 @@ def inverse_logit(logodds): Returns ------- - inverse-logit transformed results (i.e. probabilities for log-odds) + array : + inverse-logit transformed values """ return 1 / (1 + np.exp(-logodds)) def identity(value): - """Identity transformation. Returns itself + """Identity transformation. Used to transform input into itself (i.e., no transformation in applied). Note ---- @@ -56,7 +66,7 @@ def identity(value): def polygamma(n, x): - """Polygamma functions. This is a wrapper function of ``scipy.special.polygamma`` meant to enable automatic + """Polygamma function. This is a wrapper function of ``scipy.special.polygamma`` meant to enable automatic differentation with ``delicatessen``. When the input is a ``PrimalTangentPairs`` object, then an internal function that implements the polygamma function is called. Otherwise, ``scipy.special.polygamma`` is called for the input object. @@ -162,12 +172,12 @@ def standard_normal_pdf(x): def robust_loss_functions(residual, loss, k, a=None, b=None): r"""Loss functions for robust mean and robust regression estimating equations. This function is called internally - for ``ee_mean_robust`` and ``ee_robust_regression``. 
This function can also be loaded, so user's can easily adapt + for ``ee_mean_robust`` and ``ee_robust_regression``. This function can also be accessed, so user's can easily adapt their own regression models into robust regression models using the pre-defined loss functions. Note ---- - The loss functions here are technically the first-order derivatives of the loss functions + The loss functions here are technically the first-order derivatives of the loss functions you see in the literature. The following score of the loss functions, :math:`f_k()`, are available. @@ -176,30 +186,30 @@ def robust_loss_functions(residual, loss, k, a=None, b=None): .. math:: - f_k(x) = I(k \pi <= x <= k \pi) \times \sin(x/k) + f_k(x) = I(k \pi \le x \le k \pi) \times \sin(x/k) Huber .. math:: - f_k(x) = x \times I(-k < x < k) + k \times (1 - I(-k < x < k)) \times \text{sign}(x) + f_k(x) = x I(-k < x < k) + \text{sign}(x) k (1 - I(-k < x < k)) Tukey's biweight .. math:: - f_k(x) = x \times I(-k < x < k) + x \left( 1 - (x/k)^2 \right)^2 + f_k(x) = x I(-k < x < k) + x \left( 1 - (x/k)^2 \right)^2 Hampel (Hampel's add two additional parameters, :math:`a` and :math:`b`) .. math:: - f_k(x) = + f_{k,a,b}(x) = \begin{bmatrix} I(-a < x < a) \times x \\ - + I(a \ge |x| < b) \times a \times \text{sign}(x) \\ - + I(b \ge x < k) \times a \frac{k - x}{k - b} \\ - + I(-b \le x > -k) \times -a \frac{-k + x}{-k + b} \\ + + I(a \le |x| < b) \times a \times \text{sign}(x) \\ + + I(b \le x < k) \times a \frac{k - x}{k - b} \\ + + I(-k \ge x > -b) \times -a \frac{-k + x}{-k + b} \\ + I(|x| \ge k) \times 0 \end{bmatrix} @@ -315,7 +325,8 @@ def regression_predictions(X, theta, covariance, offset=None, alpha=0.05): values. Importantly, this method allows for the variance of :math:`\hat{Y}` to be estimated without having to expand the estimating equations. As such, this functionality is meant to be used after ``MEstimator`` has been used to estimate the coefficients (i.e., this function is for use after the M-estimator has computed the results for the - chosen regression model). Therefore, this function should be viewed as a post-processing functionality. + chosen regression model). Therefore, this function should be viewed as a post-processing functionality for + generating plots or tables. Note ---- @@ -329,9 +340,9 @@ def regression_predictions(X, theta, covariance, offset=None, alpha=0.05): 2-dimensional vector of values to generate predicted variances for. The number of columns must match the number of coefficients / parameters in ``theta``. theta : ndarray - Estimated coefficients from ``delicatessen.MEstimator.theta``. + Estimated coefficients from ``MEstimator.theta``. covariance : ndarray - Estimated covariance matrix from ``delicatessen.MEstimator.variance``. + Estimated covariance matrix from ``MEstimator.variance``. offset : ndarray, None, optional A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. alpha : float, optional @@ -448,9 +459,10 @@ def spline(variable, knots, power=3, restricted=True, normalized=False): r_k(X) = I(X > k) \left\{ X - k \right\}^a - s_K(X) - where :math:`K` is largest knot value. Splines are normalized by the upper knot minus the lower knot to the - corresponding power. Normalizing the splines can be helpful for the root-finding procedure, but does change the - interpretation of the corresponding coefficients. + where :math:`K` is largest knot value. + + Splines are normalized by the upper knot minus the lower knot to the corresponding power. 
Normalizing the splines + can be helpful for the root-finding procedure. Parameters ---------- @@ -546,16 +558,16 @@ def additive_design_matrix(X, specifications, return_penalty=False): penalty). For terms that should not have splines, ``None`` should be specified instead (see examples below). Each dictionary supports the following parameters: "knots", "natural", "power", "penalty" - * knots (list): controls the position of the knots, with knots are placed at given locations. There is no - default, so must be specified by the user. - * natural (bool): controls whether to generate natural (restricted) or unrestricted splines. - Default is ``True``, which corresponds to natural splines. - * power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic - splines. - * penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0, - which applies no penalty to the spline basis terms. - * normalized (bool): whether to normalize the spline terms. Default is ``False``, with a default change coming - with v3.0 release. + knots (list): controls the position of the knots, with knots are placed at given locations. There is no + default, so must be specified by the user. + natural (bool): controls whether to generate natural (restricted) or unrestricted splines. + Default is ``True``, which corresponds to natural splines. + power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic + splines. + penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0, + which applies no penalty to the spline basis terms. + normalized (bool): whether to normalize the spline terms. Default is ``False``, with a default change coming + with v3.0 release. return_penalty : bool, optional Whether the list of the corresponding penalty terms should also be returned. This functionality is used internally to create the list of penalty terms to provide the Ridge regression model, where only the spline diff --git a/docs/Reference/Utilities.rst b/docs/Reference/Utilities.rst index e2b5cdb..c0fa9f3 100644 --- a/docs/Reference/Utilities.rst +++ b/docs/Reference/Utilities.rst @@ -41,5 +41,5 @@ Differentiation .. autosummary:: :toctree: generated/ + approx_differentiation auto_differentiation - diff --git a/docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst b/docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst new file mode 100644 index 0000000..b75e7f2 --- /dev/null +++ b/docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst @@ -0,0 +1,6 @@ +delicatessen.derivative.approx\_differentiation +=============================================== + +.. currentmodule:: delicatessen.derivative + +.. autofunction:: approx_differentiation \ No newline at end of file
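To illustrate the ``specifications`` format shared by ``additive_design_matrix`` and ``ee_additive_regression`` above, the following is a minimal sketch; the array values and knot locations are assumptions chosen only for illustration.

    >>> import numpy as np
    >>> from delicatessen.utilities import additive_design_matrix
    >>> # Hypothetical design matrix: an intercept column and one continuous covariate
    >>> X = np.array([[1, 0.1], [1, 0.8], [1, 1.5], [1, 2.2], [1, 3.0]])
    >>> specs = [None,                                   # no spline terms for the intercept column
    >>>          {"knots": [0.5, 1.5, 2.5],              # user-specified knot locations
    >>>           "natural": True, "power": 3, "penalty": 0}]
    >>> Xa = additive_design_matrix(X=X, specifications=specs)

    The expanded matrix ``Xa`` contains the original columns plus the generated spline basis columns for the second covariate, matching the 1-by-(`b`+`k`) parameterization described for ``ee_additive_regression``.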