Skip to content

Commit

Permalink
Theil narrative
Browse files Browse the repository at this point in the history
  • Loading branch information
sjsrey committed Jan 4, 2025
1 parent cb10912 commit a20ea23
Show file tree
Hide file tree
Showing 2 changed files with 410 additions and 207 deletions.
352 changes: 267 additions & 85 deletions docs/user-guide/measure/theil.ipynb

Large diffs are not rendered by default.

265 changes: 143 additions & 122 deletions inequality/theil.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
"""
Theil Inequality metrics
"""
"""Theil Inequality metrics"""

__author__ = "Sergio J. Rey <[email protected]>"
__author__ = "Sergio J. Rey <[email protected]> "

import warnings

import numpy as np
import pandas as pd
import numpy

__all__ = ["Theil", "TheilD", "TheilDSim"]

SMALL = np.finfo("float").tiny
SMALL = numpy.finfo("float").tiny


class Theil:
Expand All @@ -28,181 +23,207 @@ class Theil:
Parameters
----------
y : array-like, DataFrame, or sequence
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
If using a DataFrame, specify the column(s) using `column` keyword(s).
column : str, optional
If `y` is a DataFrame, specify the column to be used for the calculation.
y : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
with :math:`n` taken as the observations across which inequality is
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
determined, one value for each column in ``y``.
Attributes
----------
T : numpy.array
Theil's *T* index.
An array in the shape :math:`(t,)` or :math:`(1,)`
containing Theil's *T* for each column of ``y``.
Notes
-----
The old API (nxT arrays) is deprecated and will be removed in the future.
Use nx1 sequences or DataFrames with a single column instead.
This computation involves natural logs. To prevent ``ln[0]`` from occurring, a
small value is added to each element of ``y`` before beginning the computation.
Examples
--------
>>> import libpysal
>>> import numpy
>>> from inequality.theil import Theil
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
>>> y = numpy.array([f.by_col[v] for v in vnames]).T
>>> theil_y = Theil(y)
>>> theil_y.T
array([0.20894344, 0.15222451, 0.10472941, 0.10194725, 0.09560113,
0.10511256, 0.10660832])
"""

def __init__(self, y, column=None):
# Deprecation warning for old API
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
warnings.warn(
"The nxT input format is deprecated. In future versions, "
"please provide nx1 sequences or a DataFrame with a single column.",
DeprecationWarning,
stacklevel=2,
)
# Old API behavior
n = y.shape[0]
else:
# New API: Handle sequence or DataFrame
if isinstance(y, pd.DataFrame):
if column is None:
raise ValueError("For DataFrame input, `column` must be specified.")
y = y[column].values
elif isinstance(y, list | np.ndarray):
y = np.asarray(y)
else:
raise TypeError("Input must be an array, list, or DataFrame.")
n = len(y)

# Calculation
def __init__(self, y):
n = len(y)
y = y + SMALL * (y == 0) # can't have 0 values
yt = y.sum(axis=0)
s = y / (yt * 1.0)
lns = np.log(n * s)
lns = numpy.log(n * s)
slns = s * lns
self.T = sum(slns)
t = sum(slns)
self.T = t


class TheilD:
"""
Decomposition of Theil's *T* based on partitioning of
"""Decomposition of Theil's *T* based on partitioning of
observations into exhaustive and mutually exclusive groups.
Parameters
----------
y : array-like, DataFrame, or sequence
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
If using a DataFrame, specify the column(s) using `column` keyword(s).
partition : array-like, DataFrame, or sequence
Partition indicating group membership.
If using a DataFrame, specify the column using `partition_col`.
column : str, optional
If `y` is a DataFrame, specify the column to be used for the calculation.
partition_col : str, optional
If `partition` is a DataFrame, specify the column to be used.
y : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
with :math:`n` taken as the observations across which inequality is
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
determined, one value for each column in ``y``.
partition : numpy.array
An array in the shape :math:`(n,)` of elements indicating which partition
each observation belongs to. These are assumed to be exhaustive.
Attributes
----------
T : numpy.array
Global Theil's *T*.
An array in the shape :math:`(t,)` or :math:`(1,)`
containing the global inequality *T*.
bg : numpy.array
Between-group inequality.
An array in the shape :math:`(n,t)` or :math:`(n,)`
representing between group inequality.
wg : numpy.array
Within-group inequality.
An array in the shape :math:`(n,t)` or :math:`(n,)`
representing within group inequality.
"""
Examples
--------
def __init__(self, y, partition, column=None, partition_col=None):
# Deprecation warning for old API
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1:
warnings.warn(
"The nxT input format is deprecated. In future versions, "
"please provide nx1 sequences or a DataFrame with a single column.",
DeprecationWarning,
stacklevel=2,
)
else:
# New API: Handle sequence or DataFrame
if isinstance(y, pd.DataFrame):
if column is None:
raise ValueError("For DataFrame input, `column` must be specified.")
y = y[column].values
elif isinstance(y, list | np.ndarray):
y = np.asarray(y)
else:
raise TypeError("Input must be an array, list, or DataFrame.")

# Handle partition similarly
if isinstance(partition, pd.DataFrame):
if partition_col is None:
raise ValueError(
"For DataFrame input, `partition_col` must be specified."
)
partition = partition[partition_col].values
elif isinstance(partition, list | np.ndarray):
partition = np.asarray(partition)
else:
raise TypeError("Partition must be an array, list, or DataFrame.")
>>> import libpysal
>>> import numpy
>>> from inequality.theil import TheilD
groups = np.unique(partition)
t = Theil(y).T
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
>>> y = numpy.array([f.by_col[v] for v in vnames]).T
>>> regimes = numpy.array(f.by_col('hanson98'))
>>> theil_d = TheilD(y, regimes)
>>> theil_d.bg
array([0.0345889 , 0.02816853, 0.05260921, 0.05931219, 0.03205257,
0.02963731, 0.03635872])
>>> theil_d.wg
array([0.17435454, 0.12405598, 0.0521202 , 0.04263506, 0.06354856,
0.07547525, 0.0702496 ])
"""

def __init__(self, y, partition):
groups = numpy.unique(partition)
T = Theil(y).T # noqa N806
ytot = y.sum(axis=0)

# Group totals
gtot = np.array([y[partition == gid].sum(axis=0) for gid in groups])
# group totals
gtot = numpy.array([y[partition == gid].sum(axis=0) for gid in groups])

if ytot.size == 1:
if ytot.size == 1: # y is 1-d
sg = gtot / (ytot * 1.0)
sg.shape = (sg.size, 1)
else:
sg = np.dot(gtot, np.diag(1.0 / ytot))
ng = np.array([sum(partition == gid) for gid in groups])
sg = numpy.dot(gtot, numpy.diag(1.0 / ytot))
ng = numpy.array([sum(partition == gid) for gid in groups])
ng.shape = (ng.size,) # ensure ng is 1-d
# Between group inequality
n = y.shape[0]
# between group inequality
sg = sg + (sg == 0) # handle case when a partition has 0 for sum
bg = np.multiply(sg, np.log(np.dot(np.diag(len(y) * 1.0 / ng), sg))).sum(axis=0)
bg = numpy.multiply(sg, numpy.log(numpy.dot(numpy.diag(n * 1.0 / ng), sg))).sum(
axis=0
)

self.T = t
self.T = T
self.bg = bg
self.wg = t - bg
self.wg = T - bg


class TheilDSim:
"""
Random permutation-based inference on Theil's inequality decomposition.
"""Random permutation based inference on Theil's inequality decomposition.
Provides for computationally based inference regarding the inequality
decomposition using random spatial permutations.
See :cite:`rey_interregional_2010`.
Parameters
----------
y : array-like, DataFrame, or sequence
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame.
If using a DataFrame, specify the column(s) using `column` keyword(s).
partition : array-like, DataFrame, or sequence
Partition indicating group membership.
If using a DataFrame, specify the column using `partition_col`.
y : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
with :math:`n` taken as the observations across which inequality is
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are
determined, one value for each column in ``y``.
partition : numpy.array
An array in the shape :math:`(n,)` of elements indicating which partition
each observation belongs to. These are assumed to be exhaustive.
permutations : int
The number of random spatial permutations for computationally
based inference on the decomposition.
Attributes
----------
observed : numpy.array
An array in the shape :math:`(n,t)` or :math:`(n,)`
representing a ``TheilD`` instance for the observed data.
bg : numpy.array
An array in the shape ``(permutations+1, t)``
representing between group inequality.
bg_pvalue : numpy.array
An array in the shape :math:`(t,1)` representing the :math:`p`-value
for the between group measure. Measures the percentage of the realized
values that were greater than or equal to the observed ``bg`` value.
Includes the observed value.
wg : numpy.array
An array in the shape ``(permutations+1)``
representing within group inequality. Depending on the
shape of ``y``, the array may be 1- or 2-dimensional.
Examples
--------
permutations : int, optional
Number of random permutations for inference (default: 99).
>>> import libpysal
>>> import numpy
>>> from inequality.theil import TheilDSim
column : str, optional
If `y` is a DataFrame, specify the column to be used for the calculation.
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv'))
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)]
>>> y = numpy.array([f.by_col[v] for v in vnames]).T
>>> regimes = numpy.array(f.by_col('hanson98'))
>>> numpy.random.seed(10)
>>> theil_ds = TheilDSim(y, regimes, 999)
partition_col : str, optional
If `partition` is a DataFrame, specify the column to be used.
>>> theil_ds.bg_pvalue
array([0.4 , 0.344, 0.001, 0.001, 0.034, 0.072, 0.032])
"""

def __init__(self, y, partition, permutations=99, column=None, partition_col=None):
observed = TheilD(y, partition, column=column, partition_col=partition_col)
def __init__(self, y, partition, permutations=99):
observed = TheilD(y, partition)
bg_ct = observed.bg == observed.bg # already have one extreme value
bg_ct = bg_ct * 1.0
results = [observed]
for _ in range(permutations):
yp = np.random.permutation(y)
yp = numpy.random.permutation(y)
t = TheilD(yp, partition)
bg_ct += 1.0 * t.bg >= observed.bg
results.append(t)
self.results = results
self.T = observed.T
self.bg_pvalue = bg_ct / (permutations * 1.0 + 1)
self.bg = np.array([r.bg for r in results])
self.wg = np.array([r.wg for r in results])
self.bg = numpy.array([r.bg for r in results])
self.wg = numpy.array([r.wg for r in results])

0 comments on commit a20ea23

Please sign in to comment.