From ee3f002622c8f005d067ec144f89293e8db42523 Mon Sep 17 00:00:00 2001
From: Paul Zivich <32672909+pzivich@users.noreply.github.com>
Date: Wed, 11 Dec 2024 14:25:30 -0500
Subject: [PATCH] Updating docs for GMM

---
 delicatessen/estimation.py                    | 44 ++----------------
 docs/Basics.rst                               | 45 +++++++++++++------
 docs/Examples/index.rst                       |  5 +--
 docs/Optimization Advice.rst                  | 33 +++++++-------
 docs/Reference/M-Estimator.rst                | 11 ++---
 .../delicatessen.estimation.GMMEstimator.rst  | 27 +++++++++++
 ...=> delicatessen.estimation.MEstimator.rst} |  6 +--
 docs/Reference/index.rst                      | 12 ++---
 docs/conf.py                                  |  3 ++
 docs/index.rst                                | 22 +++++++--
 10 files changed, 115 insertions(+), 93 deletions(-)
 create mode 100644 docs/Reference/generated/delicatessen.estimation.GMMEstimator.rst
 rename docs/Reference/generated/{delicatessen.mestimation.MEstimator.rst => delicatessen.estimation.MEstimator.rst} (72%)

diff --git a/delicatessen/estimation.py b/delicatessen/estimation.py
index e89c67a..9b93bd2 100644
--- a/delicatessen/estimation.py
+++ b/delicatessen/estimation.py
@@ -222,13 +222,6 @@ class MEstimator(_GeneralEstimator):
     :math:`v`-dimensional parameter vector, and :math:`O_i` is the observed data (where units are independent but
     not necessarily identically distributed).

-    Note
-    ----
-    Estimating equations are advantageous in both theoretical and applied research. They simplifies proofs of
-    consistency and asymptotic normality of estimators under a large-sample approximation framework. In application,
-    this approach simplifies variance estimation and automates the delta-method.
-
-
     M-Estimators consist of two broad steps: point estimation and variance estimation. Point estimation is carried
     out by determining the values of :math:`\theta` where the sum of the estimating equations is zero. This is done
     via standard root-finding algorithms.

@@ -253,22 +246,9 @@ class MEstimator(_GeneralEstimator):
     forward difference method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's
     ``linalg.pinv``. For the filling, the dot product is taken at :math:`\hat{\theta}`.

-    Note
-    ----
-    The difficult part (that must be done by the user) is to specify the estimating equations. Be sure to check the
-    provided examples for the expected format. Pre-built estimating equations for common problems are also made
-    available.
-
-
     After completion of these steps, point and variance estimates are stored. These can be extracted from
     ``MEstimator``. Further, confidence intervals, Z-scores, P-values, or S-values can all be generated.

-    Note
-    ----
-    For complex regression problems, the root-finding algorithms are not as robust relative to maximization approaches.
-    A simple solution for difficult problems is to 'pre-wash' or find the solution to the equations and provide those
-    as the initial starting values.
-
     Parameters
     ----------
     stacked_equations : function, callable
@@ -644,12 +624,6 @@ class GMMEstimator(_GeneralEstimator):
     :math:`v`-dimensional parameter vector, and :math:`O_i` is the observed data (where units are independent but
     not necessarily identically distributed).

-    Note
-    ----
-    Estimating equations are advantageous in both theoretical and applied research. They simplifies proofs of
-    consistency and asymptotic normality of estimators under a large-sample approximation framework. In application,
-    this approach simplifies variance estimation and automates the delta-method.
-
     Rather than root-finding for the estimating equations, the GMM estimator instead uses a minimization procedure.
    Unlike ``MEstimator``, ``GMMEstimator`` allows for over-identified problems. The general form of the GMM estimator
    is

@@ -657,11 +631,11 @@ class GMMEstimator(_GeneralEstimator):

     .. math::

         \text{argmin}_{\theta}
         \left[ \sum_{i=1}^n \psi(O_i, \hat{\theta}) \right]
-        \text{\textbf{Q}}
+        \text{Q}
         \left[ \sum_{i=1}^n \psi(O_i, \hat{\theta}) \right]

-    Here, :math:`\text{\textbf{Q}}` is a weight matrix that allows for over-identified (i.e., more parameters than
-    estimating functions in :math:`\psi`) problems. Point estimation proceeds by determining the values of
+    Here, :math:`\text{Q}` is a weight matrix that allows for over-identified (i.e., more estimating functions in
+    :math:`\psi` than parameters) problems. Point estimation proceeds by determining the values of
     :math:`\theta` where this equation is minimized. This is done via standard optimization algorithms.

@@ -685,21 +659,9 @@ class GMMEstimator(_GeneralEstimator):
     forward difference method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's
     ``linalg.pinv``. For the filling, the dot product is taken at :math:`\hat{\theta}`.

-    Note
-    ----
-    The difficult part (that must be done by the user) is to specify the estimating equations. Be sure to check the
-    provided examples for the expected format. Pre-built estimating equations for common problems are also made
-    available.
-
-
     After completion of these steps, point and variance estimates are stored. These can be extracted from
     ``GMMEstimator``. Further, confidence intervals, Z-scores, P-values, or S-values can all be generated.

-    Note
-    ----
-    For complex regression problems, minimization may be difficult. A simple solution for difficult problems is to
-    'pre-wash' or find the solution to the equations and provide those as the initial starting values.
-
     Parameters
     ----------
     stacked_equations : function, callable
@@ -733,7 +695,7 @@ class GMMEstimator(_GeneralEstimator):
     meat : ndarray
         Meat matrix for the parameter vector
     weight_matrix : ndarray
-        Weight matrix, :math:`\text{\textbf{Q}}` used. For just-identified problems, the weight matrix is the identity
+        Weight matrix, :math:`\text{Q}`, used. For just-identified problems, the weight matrix is the identity
         matrix

     Examples
diff --git a/docs/Basics.rst b/docs/Basics.rst
index a0ad7fe..3cd81dd 100644
--- a/docs/Basics.rst
+++ b/docs/Basics.rst
@@ -1,8 +1,12 @@
 Basics
 =====================================
-Here, the basics of M-estimator will be reviewed. An M-estimator, :math:`\hat{\theta}`, is defined as the solution to
-the estimating equation
+Here, the basics of the provided estimation approaches are described.
+
+M-estimator
+-------------------------------
+
+An M-estimator, :math:`\hat{\theta}`, is defined as the solution to the estimating equation

 .. math::

@@ -13,15 +17,34 @@ where :math:`\psi` is a known :math:`v \times 1`-dimension estimating function,
 :math:`i \in \{1,...,n\}`, and the parameters are the vector :math:`\theta = (\theta_1, \theta_2, ..., \theta_v)`.
 Note that :math:`v` is finite-dimensional and the number of parameters matches the dimension of the estimating
 functions.

-Point Estimation
--------------------------------
-To implement the point estimation of :math:`\theta`, we use a *root-finding* algorithm. Root-finding algorithms are
+To solve this equation for :math:`\theta`, we use a *root-finding* algorithm. Root-finding algorithms are
 procedures for finding the zeroes (i.e., roots) of an equation. This is accomplished in ``delicatessen`` by using
 SciPy's root-finding algorithms.
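+
+As a quick illustration of these pieces, consider the mean estimating equation,
+:math:`\psi(O_i, \theta) = Y_i - \theta`. A minimal sketch with ``MEstimator`` (the data values here are purely
+illustrative) is
+
+.. code::
+
+    import numpy as np
+    from delicatessen import MEstimator
+
+    y = np.array([1, 2, 3, 1, 4])          # Illustrative data
+
+    def psi(theta):
+        # Mean estimating function: Y_i - theta
+        return y - theta
+
+    estr = MEstimator(psi, init=[0., ])    # Starting value for the root-finder
+    estr.estimate()
+    print(estr.theta)                      # Point estimate
+    print(estr.variance)                   # Empirical sandwich variance estimate
+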
+GMM-estimator
+-------------------------------
+
+The generalized method of moments (GMM) estimator is instead defined as the solution to
+
+.. math::
+
+    \text{argmin}_{\theta} \left[ \sum_{i=1}^n \psi(O_i, \hat{\theta}) \right]
+    \text{Q}
+    \left[ \sum_{i=1}^n \psi(O_i, \hat{\theta}) \right]
+
+
+Here, :math:`\text{Q}` is a weight matrix. Note that solving this equation is equivalent to the M-estimator in
+the case when the dimensions of the parameters and estimating functions match. When there are more estimating functions
+than parameters (i.e., over-identified), the M-estimator can no longer be applied but the GMM-estimator can be.
+
+Unlike the M-estimator, we use a minimization algorithm to solve for :math:`\theta`. This is accomplished in
+``delicatessen`` by using SciPy's minimization algorithms.
+
 Variance Estimation
 -------------------------------
-To estimate the variance for :math:`\theta`, the M-estimator uses the empirical sandwich variance estimator:
+
+Regardless of the chosen point-estimation strategy, the empirical sandwich variance estimator is used to estimate the
+variance for :math:`\theta`:

 .. math::

@@ -31,9 +54,9 @@ where the 'bread' is

 .. math::

-    B_n(O,\hat{\theta}) = n^{-1} \sum_{i=1}^n - \psi'(O_i, \hat{\theta})
+    B_n(O,\hat{\theta}) = n^{-1} \sum_{i=1}^n - \nabla \psi(O_i, \hat{\theta})

-where the :math:`\psi'` indicates the partial derivative, and the 'filling' is
+where :math:`\nabla` indicates the matrix of partial derivatives, and the 'filling' is

 .. math::

@@ -41,11 +64,7 @@ where the :math:`\psi'` indicates the partial derivative, and the 'filling' is
 The sandwich variance requires finding the derivative of the estimating functions and some matrix algebra. Again, we
 can get the computer to complete all these calculations for us. For the derivative, ``delicatessen`` offers two
-options. First, the derivatives can be numerically approximated using the central difference method. This is done using
-SciPy's ``approx_fprime`` functionality. As of ``v2.0``, the derivatives can also be computed using forward-mode
-automatic differentiation. This approach provides the exact derivative (as opposed to an approximation). This is
-implemented by-hand in ``delicatessen`` via operator overloading. Finally, we use forward-mode because there is no
-time advantage of backward-mode because the Jacobian is square.
+options: numerical approximation or forward-mode automatic differentiation.

 Automatic Differentiation Caveats
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/Examples/index.rst b/docs/Examples/index.rst
index dbb9a38..24a27f6 100644
--- a/docs/Examples/index.rst
+++ b/docs/Examples/index.rst
@@ -1,9 +1,8 @@
 Applied Examples
 ==========================
-This section provides illustrative examples of application of M-estimators. This includes the replication of examples
-from textbooks and published scientific articles. These examples include built-in estimating equations and use of
-user-built estimating equations.
+This section provides illustrative applications. This includes the replication of examples from textbooks and published
+scientific articles. These examples include both built-in and user-built estimating equations.


 .. toctree::
diff --git a/docs/Optimization Advice.rst b/docs/Optimization Advice.rst
index 7acf914..8d0b174 100644
--- a/docs/Optimization Advice.rst
+++ b/docs/Optimization Advice.rst
@@ -5,12 +5,9 @@ This section is meant to provide some guidance if you have trouble with root-fin
 functions.
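+
+Much of the advice below corresponds to arguments of the ``estimate`` function. As a quick orientation, here is a
+sketch of the relevant options (the argument values shown are illustrative, not recommendations):
+
+.. code::
+
+    import numpy as np
+    from delicatessen import MEstimator
+
+    y = np.array([0, 1, 1, 0, 1])         # Illustrative data
+
+    def psi(theta):
+        # Mean estimating function for a binary variable
+        return y - theta
+
+    estr = MEstimator(psi, init=[0.5, ])  # init: choice of starting values
+    estr.estimate(solver='hybr',          # solver: the optimization algorithm
+                  maxiter=2000,           # maxiter: number of allowed iterations
+                  tolerance=1e-6)         # tolerance: convergence criterion
+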
 A weakness of ``delicatessen`` is that it does not have the fastest or most robust routines for estimating statistical
-parameters (in general maximizing the likelihood is easier computationally than root-finding of the score functions).
-This is the cost of the flexibility of the general M-Estimator (a cost that I believe to be worthwhile). When
-:math:`\theta` only consists of a few parameters, the root-finding will generally be robust. However, cases where
-:math:`\theta` consists of many parameters is likely to occur.
+parameters. This is the cost of the flexibility of the general M-Estimator (a cost that I believe to be worthwhile).

-Below are a few recommendations on getting ``MEstimator`` to find the roots of the estimating equations.
+Below are a few recommendations for finding :math:`\theta` with ``MEstimator`` or ``GMMEstimator``.

 Center initial values
 ---------------------
@@ -20,21 +17,21 @@ values. However, we don't know what those values may be. Since the root-finding
 best to start it in the 'middle' of the possible space.

 As an example, consider the mean estimating equation for a binary variable. The starting value could be specified as
-0, 1, or any other number. However, we can be nice to the root-finding by providing 0.5 as the starting value, since
+0, 1, or any other number. However, we can be nice to the optimizer by providing 0.5 as the starting value, since
 0.5 is the middle of the possible space for a proportion. For regression, starting values of 0 are likely to be
 preferred (without additional information about a problem).

 If initial values are placed outside of the bounds of a particular :math:`\theta`, this can also break the optimization
-procedure. Returning to the proportion, providing a starting value of -10 may cause the root-finder trouble, since
-proportions are actually bound to [0,1]. So make sure your initial values are (1) reasonable, and (2) within the bounds
-of the measure :math:`\theta`.
+procedure. Returning to the proportion, providing a starting value of -10 may cause trouble, since proportions are
+actually bounded to [0,1]. So make sure your initial values are (1) reasonable, and (2) within the bounds of the
+measure :math:`\theta`.

 Pre-wash initials
 --------------------
 In the case of stacked estimating equations composed of multiple estimating functions (e.g., g-computation, IPW, AIPW),
-some parameters can be estimated indepedent of the others. Then the pre-optimized values can be passed as the initial
+some parameters can be estimated independently of the others. Then the pre-optimized values can be passed as the initial
 values for the overall estimator. This 'pre-washing' of values allows the ``delicatessen`` root-finding to focus on
 values of :math:`\theta` that can't be optimized outside.

@@ -42,22 +39,22 @@ This pre-washing approach is particularly useful for regression models, since mo
 for most regression implementations. Pre-washing the initial values allows ``delicatessen`` to 'borrow' the strength
 of more stable methods.

-Finally, ``delicatessen`` offers the option to run the root-finding procedure for a subset of the estimating functions.
+Finally, ``delicatessen`` offers the option to run the optimization procedure for a subset of the estimating functions.
 Therefore, some parameters can be solved outside of the procedure and only the remaining subset can be searched for.
 This option is particularly valuable when an estimator consists of hundreds of parameters.
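+
+As a sketch of pre-washing (assuming the built-in ``ee_regression`` estimating equation and a logistic model fit
+first with ``statsmodels``; the data are simulated purely for illustration):
+
+.. code::
+
+    import numpy as np
+    import statsmodels.api as sm
+    from delicatessen import MEstimator
+    from delicatessen.estimating_equations import ee_regression
+
+    # Simulated data for illustration
+    np.random.seed(809)
+    n = 200
+    x = np.random.normal(size=n)
+    X = np.column_stack([np.ones(n), x])                         # Design matrix with intercept
+    y = np.random.binomial(n=1, p=1 / (1 + np.exp(-x)), size=n)  # Binary outcome
+
+    def psi(theta):
+        return ee_regression(theta=theta, X=X, y=y, model='logistic')
+
+    # Pre-wash: solve the regression with a stable maximization routine first
+    prewash = sm.GLM(y, X, family=sm.families.Binomial()).fit()
+
+    # Pass the pre-washed coefficients as the initial values
+    estr = MEstimator(psi, init=list(prewash.params))
+    estr.estimate()
+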
 Increase iterations
 --------------------
-If neither of those works, increasing the number of iterations is a good next place. By default, ``MEstimator``
-goes to 1000 iterations (far beyond SciPy's default value).
+If neither of those works, increasing the number of iterations is a good next step.

-Different root-finding
+Different optimization
 ----------------------
-By default, ``delicatessen`` uses the secant method available in ``scipy.optimize.newton``. However, ``delicatessen``
-also supports other algorithms in ``scipy.optimize.root``, such as Levenberg-Marquette and Powell's Hybrid.
-Additionally, manually-specified root-finding algorithms can also be used.
+By default, ``MEstimator`` uses the Levenberg-Marquardt method for root-finding and ``GMMEstimator`` uses the BFGS
+method for minimization. However, ``delicatessen`` also supports other algorithms in ``scipy.optimize``, as well as
+manually-specified algorithms. These other algorithms may have better operating
+characteristics for some problems.

 Non-smooth equations
 --------------------
@@ -65,7 +62,7 @@ As mentioned in the examples, non-smooth estimating equations (e.g., percentiles
 difficult to optimize. In general, it is best to avoid using ``delicatessen`` with non-smooth estimating equations.

 If one must use ``delicatessen`` with non-smooth estimating equations, some tricks we have found helpful are to:
-use ``solver='hybr'`` and increasing the tolerance (to the same order as :math:`n`) help.
+increase the tolerance (to the same order as :math:`n`) or modify the optimization algorithm.

 A warning
 -------------------
diff --git a/docs/Reference/M-Estimator.rst b/docs/Reference/M-Estimator.rst
index cbb85c2..ed051b8 100644
--- a/docs/Reference/M-Estimator.rst
+++ b/docs/Reference/M-Estimator.rst
@@ -1,19 +1,20 @@
-M-Estimator
+Estimators
 ===========

-Reference documentation for the M-Estimator available in ``delicatessen``. This is the main utility in the
-``delicatessen`` library. For implementation of your own estimating equations with ``delicatessen``, see the
+Reference documentation for the M-Estimator and GMM-Estimator available in ``delicatessen``. These are the main
+utilities in the library. For implementation of your own estimating equations with ``delicatessen``, see the
 documentation and examples provided in the 'Custom Equations' section.

-M-Estimator
+Estimators
 ---------------------------

-.. currentmodule:: delicatessen.mestimation
+.. currentmodule:: delicatessen.estimation

 .. autosummary::
   :toctree: generated/

   MEstimator
+  GMMEstimator


 Sandwich Variance Estimator
diff --git a/docs/Reference/generated/delicatessen.estimation.GMMEstimator.rst b/docs/Reference/generated/delicatessen.estimation.GMMEstimator.rst
new file mode 100644
index 0000000..7ce4ec0
--- /dev/null
+++ b/docs/Reference/generated/delicatessen.estimation.GMMEstimator.rst
@@ -0,0 +1,27 @@
+delicatessen.estimation.GMMEstimator
+====================================
+
+.. currentmodule:: delicatessen.estimation
+
+.. autoclass:: GMMEstimator
+
+
+   .. automethod:: __init__
+
+
+   .. rubric:: Methods
+
+   .. autosummary::
+
+      ~GMMEstimator.__init__
+      ~GMMEstimator.confidence_intervals
+      ~GMMEstimator.estimate
+      ~GMMEstimator.p_values
+      ~GMMEstimator.s_values
+      ~GMMEstimator.z_scores
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/Reference/generated/delicatessen.mestimation.MEstimator.rst b/docs/Reference/generated/delicatessen.estimation.MEstimator.rst
similarity index 72%
rename from docs/Reference/generated/delicatessen.mestimation.MEstimator.rst
rename to docs/Reference/generated/delicatessen.estimation.MEstimator.rst
index 95f20ae..ef4c37e 100644
--- a/docs/Reference/generated/delicatessen.mestimation.MEstimator.rst
+++ b/docs/Reference/generated/delicatessen.estimation.MEstimator.rst
@@ -1,7 +1,7 @@
-delicatessen.mestimation.MEstimator
-===================================
+delicatessen.estimation.MEstimator
+==================================

-.. currentmodule:: delicatessen.mestimation
+.. currentmodule:: delicatessen.estimation

 .. autoclass:: MEstimator

diff --git a/docs/Reference/index.rst b/docs/Reference/index.rst
index 9cac25f..38738a6 100644
--- a/docs/Reference/index.rst
+++ b/docs/Reference/index.rst
@@ -1,17 +1,17 @@
 Reference
 ==========================
-Documentation for all available functions and arguments for those functions are provided here. *M-Estimator* contains
-documentation for the general M-Estimator procedure. *Estimating Equations* details the built-in estimating equations
-that come with ``delicatessen``.
+Documentation for all available functions and the arguments of those functions is provided here. *Estimators* contains
+documentation for the general M-Estimator and GMM-Estimator procedures. *Estimating Equations* details the built-in
+estimating equations that come with ``delicatessen``.

-For a more narrative-driven description of M-Estimation and how to use ``delicatessen``, please see the sections
-provided in the side-bar.
+For a more narrative-driven description of M-Estimation and GMM-Estimation, and of how to use ``delicatessen``, please
+see the sections provided in the side-bar.

 .. toctree::
   :maxdepth: 2

-  M-Estimator
+  Estimators
   Estimating Equations
   Utilities
diff --git a/docs/conf.py b/docs/conf.py
index 2ceea4f..9a4a599 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -35,6 +35,7 @@ def get_version():
 extensions = [
     "nbsphinx",
     "sphinx.ext.coverage",
+    "sphinx.ext.inheritance_diagram",
     "sphinx.ext.mathjax",
     "sphinx.ext.autosectionlabel",
     "sphinx.ext.napoleon",
@@ -53,9 +54,11 @@ def get_version():

 # generate autosummary pages
 autosummary_generate = True
+autodoc_default_flags = ['members', 'undoc-members', 'inherited-members']
 autodoc_default_options = {
     'members': True,
     'member-order': 'bysource',
+    'autosummary': True,
 }

 # General information about the project.
diff --git a/docs/index.rst b/docs/index.rst
index a261d55..187e6af 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -4,10 +4,10 @@ Delicatessen
 =====================================
 ``delicatessen`` is a one-stop shop for all your sandwich (variance) needs. This Python 3.8+ library supports
-M-estimation, which is a general statistical framework for estimating unknown parameters. If you are an R user, I
-recommend looking into ``geex`` (`Saul & Hudgens (2020) `_).
-``delicatessen`` supports a variety of pre-built estimating equations as well as custom, user built estimating
-equations.
+M-estimation and the Generalized Method of Moments (GMM), which are general statistical frameworks for estimating
+unknown parameters. If you are an R user, I recommend looking into ``geex``
+(`Saul & Hudgens (2020) `_). ``delicatessen`` supports a variety of pre-built estimating
+equations as well as custom, user-built estimating equations.

 Here, we provide a brief overview of M-Estimation. For a more detailed treatment, please refer to Ross et al. (2024),
 Stefanski & Boos (2002), or Boos & Stefanski (2013). M-estimation was developed to study the large sample properties
@@ -45,6 +45,20 @@ root-finding algorithm. After successful completion of the root-finding, the bre
 approximating the partial derivatives and the filling is calculated. Finally, the empirical sandwich variance is
 computed.

+GMM is a related set of methods. The key distinction is that the GMM-estimator instead solves
+
+.. math::
+
+    \text{argmin}_{\theta} \left[ \sum_{i=1}^n \psi(O_i, \hat{\theta}) \right]
+    \text{Q}
+    \left[ \sum_{i=1}^n \psi(O_i, \hat{\theta}) \right]
+
+
+where :math:`\text{Q}` is a weight matrix. Unlike the M-estimator, the GMM-estimator allows for the dimension
+of the estimating functions to be larger than the number of parameters (referred to as over-identification). This is
+accomplished through the weight matrix. In the case where the estimating functions and parameters are of the same
+dimension, the M-estimator and GMM-estimator are expected to be equivalent.
+
 Installation:
 -------------