Theil narrative

pysal · Jan 4, 2025 · a20ea23 · a20ea23
1 parent cb10912
commit a20ea23
Show file tree

Hide file tree

Showing 2 changed files with 410 additions and 207 deletions.
diff --git a/docs/user-guide/measure/theil.ipynb b/docs/user-guide/measure/theil.ipynb
diff --git a/inequality/theil.py b/inequality/theil.py
@@ -1,17 +1,12 @@
-"""
-Theil Inequality metrics
-"""
+"""Theil Inequality metrics"""
 
-__author__ = "Sergio J. Rey <[email protected]>"
+__author__ = "Sergio J. Rey <[email protected]> "
 
-import warnings
-
-import numpy as np
-import pandas as pd
+import numpy
 
 __all__ = ["Theil", "TheilD", "TheilDSim"]
 
-SMALL = np.finfo("float").tiny
+SMALL = numpy.finfo("float").tiny
 
 
 class Theil:
@@ -28,181 +23,207 @@ class Theil:
 
     Parameters
     ----------
-    y : array-like, DataFrame, or sequence
-        Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
-        If using a DataFrame, specify the column(s) using `column` keyword(s).
 
-    column : str, optional
-        If `y` is a DataFrame, specify the column to be used for the calculation.
+    y : numpy.array
+        An array in the shape :math:`(n,t)` or :math:`(n,)`
+        with :math:`n` taken as the observations across which inequality is
+        calculated.  If ``y`` is :math:`(n,)` then a scalar inequality value is
+        determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
+        determined, one value for each column in ``y``.
 
     Attributes
     ----------
+
     T : numpy.array
-        Theil's *T* index.
+        An array in the shape :math:`(t,)` or :math:`(1,)`
+        containing Theil's *T* for each column of ``y``.
 
     Notes
     -----
-    The old API (nxT arrays) is deprecated and will be removed in the future.
-    Use nx1 sequences or DataFrames with a single column instead.
+    This computation involves natural logs. To prevent ``ln[0]`` from occurring, a
+    small value is added to each element of ``y`` before beginning the computation.
+
+    Examples
+    --------
+
+    >>> import libpysal
+    >>> import numpy
+    >>> from inequality.theil import Theil
+
+    >>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
+    >>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
+    >>> y = numpy.array([f.by_col[v] for v in vnames]).T
+    >>> theil_y = Theil(y)
+
+    >>> theil_y.T
+    array([0.20894344, 0.15222451, 0.10472941, 0.10194725, 0.09560113,
+           0.10511256, 0.10660832])
 
     """
 
-    def __init__(self, y, column=None):
-        # Deprecation warning for old API
-        if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
-            warnings.warn(
-                "The nxT input format is deprecated. In future versions, "
-                "please provide nx1 sequences or a DataFrame with a single column.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            # Old API behavior
-            n = y.shape[0]
-        else:
-            # New API: Handle sequence or DataFrame
-            if isinstance(y, pd.DataFrame):
-                if column is None:
-                    raise ValueError("For DataFrame input, `column` must be specified.")
-                y = y[column].values
-            elif isinstance(y, list | np.ndarray):
-                y = np.asarray(y)
-            else:
-                raise TypeError("Input must be an array, list, or DataFrame.")
-            n = len(y)
-
-        # Calculation
+    def __init__(self, y):
+        n = len(y)
         y = y + SMALL * (y == 0)  # can't have 0 values
         yt = y.sum(axis=0)
         s = y / (yt * 1.0)
-        lns = np.log(n * s)
+        lns = numpy.log(n * s)
         slns = s * lns
-        self.T = sum(slns)
+        t = sum(slns)
+        self.T = t
 
 
 class TheilD:
-    """
-    Decomposition of Theil's *T* based on partitioning of
+    """Decomposition of Theil's *T* based on partitioning of
     observations into exhaustive and mutually exclusive groups.
 
     Parameters
     ----------
-    y : array-like, DataFrame, or sequence
-        Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
-        If using a DataFrame, specify the column(s) using `column` keyword(s).
-
-    partition : array-like, DataFrame, or sequence
-        Partition indicating group membership.
-        If using a DataFrame, specify the column using `partition_col`.
 
-    column : str, optional
-        If `y` is a DataFrame, specify the column to be used for the calculation.
-
-    partition_col : str, optional
-        If `partition` is a DataFrame, specify the column to be used.
+    y : numpy.array
+        An array in the shape :math:`(n,t)` or :math:`(n,)`
+        with :math:`n` taken as the observations across which inequality is
+        calculated.  If ``y`` is :math:`(n,)` then a scalar inequality value is
+        determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
+        determined, one value for each column in ``y``.
+    partition : numpy.array
+        An array in the shape :math:`(n,)` of elements indicating which partition
+        each observation belongs to. These are assumed to be exhaustive.
 
     Attributes
     ----------
+
     T : numpy.array
-        Global Theil's *T*.
+        An array in the shape :math:`(t,)` or :math:`(1,)`
+        containing the global inequality *T*.
     bg : numpy.array
-        Between-group inequality.
+        An array in the shape :math:`(n,t)` or :math:`(n,)`
+        representing between group inequality.
     wg : numpy.array
-        Within-group inequality.
+        An array in the shape :math:`(n,t)` or :math:`(n,)`
+        representing within group inequality.
 
-    """
+    Examples
+    --------
 
-    def __init__(self, y, partition, column=None, partition_col=None):
-        # Deprecation warning for old API
-        if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
-            warnings.warn(
-                "The nxT input format is deprecated. In future versions, "
-                "please provide nx1 sequences or a DataFrame with a single column.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-        else:
-            # New API: Handle sequence or DataFrame
-            if isinstance(y, pd.DataFrame):
-                if column is None:
-                    raise ValueError("For DataFrame input, `column` must be specified.")
-                y = y[column].values
-            elif isinstance(y, list | np.ndarray):
-                y = np.asarray(y)
-            else:
-                raise TypeError("Input must be an array, list, or DataFrame.")
-
-        # Handle partition similarly
-        if isinstance(partition, pd.DataFrame):
-            if partition_col is None:
-                raise ValueError(
-                    "For DataFrame input, `partition_col` must be specified."
-                )
-            partition = partition[partition_col].values
-        elif isinstance(partition, list | np.ndarray):
-            partition = np.asarray(partition)
-        else:
-            raise TypeError("Partition must be an array, list, or DataFrame.")
+    >>> import libpysal
+    >>> import numpy
+    >>> from inequality.theil import TheilD
 
-        groups = np.unique(partition)
-        t = Theil(y).T
+    >>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
+    >>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
+    >>> y = numpy.array([f.by_col[v] for v in vnames]).T
+    >>> regimes = numpy.array(f.by_col('hanson98'))
+    >>> theil_d = TheilD(y, regimes)
+
+    >>> theil_d.bg
+    array([0.0345889 , 0.02816853, 0.05260921, 0.05931219, 0.03205257,
+           0.02963731, 0.03635872])
+
+    >>> theil_d.wg
+    array([0.17435454, 0.12405598, 0.0521202 , 0.04263506, 0.06354856,
+           0.07547525, 0.0702496 ])
+
+    """
+
+    def __init__(self, y, partition):
+        groups = numpy.unique(partition)
+        T = Theil(y).T  # noqa N806
         ytot = y.sum(axis=0)
 
-        # Group totals
-        gtot = np.array([y[partition == gid].sum(axis=0) for gid in groups])
+        # group totals
+        gtot = numpy.array([y[partition == gid].sum(axis=0) for gid in groups])
 
-        if ytot.size == 1:
+        if ytot.size == 1:  # y is 1-d
             sg = gtot / (ytot * 1.0)
             sg.shape = (sg.size, 1)
         else:
-            sg = np.dot(gtot, np.diag(1.0 / ytot))
-        ng = np.array([sum(partition == gid) for gid in groups])
+            sg = numpy.dot(gtot, numpy.diag(1.0 / ytot))
+        ng = numpy.array([sum(partition == gid) for gid in groups])
         ng.shape = (ng.size,)  # ensure ng is 1-d
-        # Between group inequality
+        n = y.shape[0]
+        # between group inequality
         sg = sg + (sg == 0)  # handle case when a partition has 0 for sum
-        bg = np.multiply(sg, np.log(np.dot(np.diag(len(y) * 1.0 / ng), sg))).sum(axis=0)
+        bg = numpy.multiply(sg, numpy.log(numpy.dot(numpy.diag(n * 1.0 / ng), sg))).sum(
+            axis=0
+        )
 
-        self.T = t
+        self.T = T
         self.bg = bg
-        self.wg = t - bg
+        self.wg = T - bg
 
 
 class TheilDSim:
-    """
-    Random permutation-based inference on Theil's inequality decomposition.
+    """Random permutation based inference on Theil's inequality decomposition.
+    Provides for computationally based inference regarding the inequality
+    decomposition using random spatial permutations.
+    See :cite:`rey_interregional_2010`.
 
     Parameters
     ----------
-    y : array-like, DataFrame, or sequence
-        Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
-        If using a DataFrame, specify the column(s) using `column` keyword(s).
 
-    partition : array-like, DataFrame, or sequence
-        Partition indicating group membership.
-        If using a DataFrame, specify the column using `partition_col`.
+    y : numpy.array
+        An array in the shape :math:`(n,t)` or :math:`(n,)`
+        with :math:`n` taken as the observations across which inequality is
+        calculated.  If ``y`` is :math:`(n,)` then a scalar inequality value is
+        determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
+        determined, one value for each column in ``y``.
+    partition : numpy.array
+        An array in the shape :math:`(n,)` of elements indicating which partition
+        each observation belongs to. These are assumed to be exhaustive.
+    permutations : int
+        The number of random spatial permutations for computationally
+        based inference on the decomposition.
+
+    Attributes
+    ----------
+
+    observed : numpy.array
+        An array in the shape :math:`(n,t)` or :math:`(n,)`
+        representing a ``TheilD`` instance for the observed data.
+    bg : numpy.array
+        An array in the shape ``(permutations+1, t)``
+        representing between group inequality.
+    bg_pvalue : numpy.array
+        An array in the shape :math:`(t,1)` representing the :math:`p`-value
+        for the between group measure. Measures the percentage of the realized
+        values that were greater than or equal to the observed ``bg`` value.
+        Includes the observed value.
+    wg : numpy.array
+        An array in the shape ``(permutations+1)``
+        representing within group inequality. Depending on the
+        shape of ``y``, the array may be 1- or 2-dimensional.
+
+    Examples
+    --------
 
-    permutations : int, optional
-        Number of random permutations for inference (default: 99).
+    >>> import libpysal
+    >>> import numpy
+    >>> from inequality.theil import TheilDSim
 
-    column : str, optional
-        If `y` is a DataFrame, specify the column to be used for the calculation.
+    >>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
+    >>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
+    >>> y = numpy.array([f.by_col[v] for v in vnames]).T
+    >>> regimes = numpy.array(f.by_col('hanson98'))
+    >>> numpy.random.seed(10)
+    >>> theil_ds = TheilDSim(y, regimes, 999)
 
-    partition_col : str, optional
-        If `partition` is a DataFrame, specify the column to be used.
+    >>> theil_ds.bg_pvalue
+    array([0.4  , 0.344, 0.001, 0.001, 0.034, 0.072, 0.032])
 
     """
 
-    def __init__(self, y, partition, permutations=99, column=None, partition_col=None):
-        observed = TheilD(y, partition, column=column, partition_col=partition_col)
+    def __init__(self, y, partition, permutations=99):
+        observed = TheilD(y, partition)
         bg_ct = observed.bg == observed.bg  # already have one extreme value
         bg_ct = bg_ct * 1.0
         results = [observed]
         for _ in range(permutations):
-            yp = np.random.permutation(y)
+            yp = numpy.random.permutation(y)
             t = TheilD(yp, partition)
             bg_ct += 1.0 * t.bg >= observed.bg
             results.append(t)
         self.results = results
         self.T = observed.T
         self.bg_pvalue = bg_ct / (permutations * 1.0 + 1)
-        self.bg = np.array([r.bg for r in results])
-        self.wg = np.array([r.wg for r in results])
+        self.bg = numpy.array([r.bg for r in results])
+        self.wg = numpy.array([r.wg for r in results])