-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
410 additions
and
207 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,12 @@ | ||
""" | ||
Theil Inequality metrics | ||
""" | ||
"""Theil Inequality metrics""" | ||
|
||
__author__ = "Sergio J. Rey <[email protected]>" | ||
__author__ = "Sergio J. Rey <[email protected]> " | ||
|
||
import warnings | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import numpy | ||
|
||
__all__ = ["Theil", "TheilD", "TheilDSim"] | ||
|
||
SMALL = np.finfo("float").tiny | ||
SMALL = numpy.finfo("float").tiny | ||
|
||
|
||
class Theil: | ||
|
@@ -28,181 +23,207 @@ class Theil: | |
Parameters | ||
---------- | ||
y : array-like, DataFrame, or sequence | ||
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame. | ||
If using a DataFrame, specify the column(s) using `column` keyword(s). | ||
column : str, optional | ||
If `y` is a DataFrame, specify the column to be used for the calculation. | ||
y : numpy.array | ||
An array in the shape :math:`(n,t)` or :math:`(n,)` | ||
with :math:`n` taken as the observations across which inequality is | ||
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is | ||
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are | ||
determined, one value for each column in ``y``. | ||
Attributes | ||
---------- | ||
T : numpy.array | ||
Theil's *T* index. | ||
An array in the shape :math:`(t,)` or :math:`(1,)` | ||
containing Theil's *T* for each column of ``y``. | ||
Notes | ||
----- | ||
The old API (nxT arrays) is deprecated and will be removed in the future. | ||
Use nx1 sequences or DataFrames with a single column instead. | ||
This computation involves natural logs. To prevent ``ln[0]`` from occurring, a | ||
small value is added to each element of ``y`` before beginning the computation. | ||
Examples | ||
-------- | ||
>>> import libpysal | ||
>>> import numpy | ||
>>> from inequality.theil import Theil | ||
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv')) | ||
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)] | ||
>>> y = numpy.array([f.by_col[v] for v in vnames]).T | ||
>>> theil_y = Theil(y) | ||
>>> theil_y.T | ||
array([0.20894344, 0.15222451, 0.10472941, 0.10194725, 0.09560113, | ||
0.10511256, 0.10660832]) | ||
""" | ||
|
||
def __init__(self, y, column=None): | ||
# Deprecation warning for old API | ||
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1: | ||
warnings.warn( | ||
"The nxT input format is deprecated. In future versions, " | ||
"please provide nx1 sequences or a DataFrame with a single column.", | ||
DeprecationWarning, | ||
stacklevel=2, | ||
) | ||
# Old API behavior | ||
n = y.shape[0] | ||
else: | ||
# New API: Handle sequence or DataFrame | ||
if isinstance(y, pd.DataFrame): | ||
if column is None: | ||
raise ValueError("For DataFrame input, `column` must be specified.") | ||
y = y[column].values | ||
elif isinstance(y, list | np.ndarray): | ||
y = np.asarray(y) | ||
else: | ||
raise TypeError("Input must be an array, list, or DataFrame.") | ||
n = len(y) | ||
|
||
# Calculation | ||
def __init__(self, y): | ||
n = len(y) | ||
y = y + SMALL * (y == 0) # can't have 0 values | ||
yt = y.sum(axis=0) | ||
s = y / (yt * 1.0) | ||
lns = np.log(n * s) | ||
lns = numpy.log(n * s) | ||
slns = s * lns | ||
self.T = sum(slns) | ||
t = sum(slns) | ||
self.T = t | ||
|
||
|
||
class TheilD: | ||
""" | ||
Decomposition of Theil's *T* based on partitioning of | ||
"""Decomposition of Theil's *T* based on partitioning of | ||
observations into exhaustive and mutually exclusive groups. | ||
Parameters | ||
---------- | ||
y : array-like, DataFrame, or sequence | ||
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame. | ||
If using a DataFrame, specify the column(s) using `column` keyword(s). | ||
partition : array-like, DataFrame, or sequence | ||
Partition indicating group membership. | ||
If using a DataFrame, specify the column using `partition_col`. | ||
column : str, optional | ||
If `y` is a DataFrame, specify the column to be used for the calculation. | ||
partition_col : str, optional | ||
If `partition` is a DataFrame, specify the column to be used. | ||
y : numpy.array | ||
An array in the shape :math:`(n,t)` or :math:`(n,)` | ||
with :math:`n` taken as the observations across which inequality is | ||
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is | ||
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are | ||
determined, one value for each column in ``y``. | ||
partition : numpy.array | ||
An array in the shape :math:`(n,)` of elements indicating which partition | ||
each observation belongs to. These are assumed to be exhaustive. | ||
Attributes | ||
---------- | ||
T : numpy.array | ||
Global Theil's *T*. | ||
An array in the shape :math:`(t,)` or :math:`(1,)` | ||
containing the global inequality *T*. | ||
bg : numpy.array | ||
Between-group inequality. | ||
An array in the shape :math:`(n,t)` or :math:`(n,)` | ||
representing between group inequality. | ||
wg : numpy.array | ||
Within-group inequality. | ||
An array in the shape :math:`(n,t)` or :math:`(n,)` | ||
representing within group inequality. | ||
""" | ||
Examples | ||
-------- | ||
def __init__(self, y, partition, column=None, partition_col=None): | ||
# Deprecation warning for old API | ||
if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] > 1: | ||
warnings.warn( | ||
"The nxT input format is deprecated. In future versions, " | ||
"please provide nx1 sequences or a DataFrame with a single column.", | ||
DeprecationWarning, | ||
stacklevel=2, | ||
) | ||
else: | ||
# New API: Handle sequence or DataFrame | ||
if isinstance(y, pd.DataFrame): | ||
if column is None: | ||
raise ValueError("For DataFrame input, `column` must be specified.") | ||
y = y[column].values | ||
elif isinstance(y, list | np.ndarray): | ||
y = np.asarray(y) | ||
else: | ||
raise TypeError("Input must be an array, list, or DataFrame.") | ||
|
||
# Handle partition similarly | ||
if isinstance(partition, pd.DataFrame): | ||
if partition_col is None: | ||
raise ValueError( | ||
"For DataFrame input, `partition_col` must be specified." | ||
) | ||
partition = partition[partition_col].values | ||
elif isinstance(partition, list | np.ndarray): | ||
partition = np.asarray(partition) | ||
else: | ||
raise TypeError("Partition must be an array, list, or DataFrame.") | ||
>>> import libpysal | ||
>>> import numpy | ||
>>> from inequality.theil import TheilD | ||
groups = np.unique(partition) | ||
t = Theil(y).T | ||
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv')) | ||
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)] | ||
>>> y = numpy.array([f.by_col[v] for v in vnames]).T | ||
>>> regimes = numpy.array(f.by_col('hanson98')) | ||
>>> theil_d = TheilD(y, regimes) | ||
>>> theil_d.bg | ||
array([0.0345889 , 0.02816853, 0.05260921, 0.05931219, 0.03205257, | ||
0.02963731, 0.03635872]) | ||
>>> theil_d.wg | ||
array([0.17435454, 0.12405598, 0.0521202 , 0.04263506, 0.06354856, | ||
0.07547525, 0.0702496 ]) | ||
""" | ||
|
||
def __init__(self, y, partition): | ||
groups = numpy.unique(partition) | ||
T = Theil(y).T # noqa N806 | ||
ytot = y.sum(axis=0) | ||
|
||
# Group totals | ||
gtot = np.array([y[partition == gid].sum(axis=0) for gid in groups]) | ||
# group totals | ||
gtot = numpy.array([y[partition == gid].sum(axis=0) for gid in groups]) | ||
|
||
if ytot.size == 1: | ||
if ytot.size == 1: # y is 1-d | ||
sg = gtot / (ytot * 1.0) | ||
sg.shape = (sg.size, 1) | ||
else: | ||
sg = np.dot(gtot, np.diag(1.0 / ytot)) | ||
ng = np.array([sum(partition == gid) for gid in groups]) | ||
sg = numpy.dot(gtot, numpy.diag(1.0 / ytot)) | ||
ng = numpy.array([sum(partition == gid) for gid in groups]) | ||
ng.shape = (ng.size,) # ensure ng is 1-d | ||
# Between group inequality | ||
n = y.shape[0] | ||
# between group inequality | ||
sg = sg + (sg == 0) # handle case when a partition has 0 for sum | ||
bg = np.multiply(sg, np.log(np.dot(np.diag(len(y) * 1.0 / ng), sg))).sum(axis=0) | ||
bg = numpy.multiply(sg, numpy.log(numpy.dot(numpy.diag(n * 1.0 / ng), sg))).sum( | ||
axis=0 | ||
) | ||
|
||
self.T = t | ||
self.T = T | ||
self.bg = bg | ||
self.wg = t - bg | ||
self.wg = T - bg | ||
|
||
|
||
class TheilDSim: | ||
""" | ||
Random permutation-based inference on Theil's inequality decomposition. | ||
"""Random permutation based inference on Theil's inequality decomposition. | ||
Provides for computationally based inference regarding the inequality | ||
decomposition using random spatial permutations. | ||
See :cite:`rey_interregional_2010`. | ||
Parameters | ||
---------- | ||
y : array-like, DataFrame, or sequence | ||
Either an `nxT` array (deprecated) or `nx1` sequence or DataFrame. | ||
If using a DataFrame, specify the column(s) using `column` keyword(s). | ||
partition : array-like, DataFrame, or sequence | ||
Partition indicating group membership. | ||
If using a DataFrame, specify the column using `partition_col`. | ||
y : numpy.array | ||
An array in the shape :math:`(n,t)` or :math:`(n,)` | ||
with :math:`n` taken as the observations across which inequality is | ||
calculated. If ``y`` is :math:`(n,)` then a scalar inequality value is | ||
determined. If ``y`` is :math:`(n,t)` then an array of inequality values are | ||
determined, one value for each column in ``y``. | ||
partition : numpy.array | ||
An array in the shape :math:`(n,)` of elements indicating which partition | ||
each observation belongs to. These are assumed to be exhaustive. | ||
permutations : int | ||
The number of random spatial permutations for computationally | ||
based inference on the decomposition. | ||
Attributes | ||
---------- | ||
observed : numpy.array | ||
An array in the shape :math:`(n,t)` or :math:`(n,)` | ||
representing a ``TheilD`` instance for the observed data. | ||
bg : numpy.array | ||
An array in the shape ``(permutations+1, t)`` | ||
representing between group inequality. | ||
bg_pvalue : numpy.array | ||
An array in the shape :math:`(t,1)` representing the :math:`p`-value | ||
for the between group measure. Measures the percentage of the realized | ||
values that were greater than or equal to the observed ``bg`` value. | ||
Includes the observed value. | ||
wg : numpy.array | ||
An array in the shape ``(permutations+1)`` | ||
representing within group inequality. Depending on the | ||
shape of ``y``, the array may be 1- or 2-dimensional. | ||
Examples | ||
-------- | ||
permutations : int, optional | ||
Number of random permutations for inference (default: 99). | ||
>>> import libpysal | ||
>>> import numpy | ||
>>> from inequality.theil import TheilDSim | ||
column : str, optional | ||
If `y` is a DataFrame, specify the column to be used for the calculation. | ||
>>> f = libpysal.io.open(libpysal.examples.get_path('mexico.csv')) | ||
>>> vnames = [f'pcgdp{dec}' for dec in range(1940, 2010, 10)] | ||
>>> y = numpy.array([f.by_col[v] for v in vnames]).T | ||
>>> regimes = numpy.array(f.by_col('hanson98')) | ||
>>> numpy.random.seed(10) | ||
>>> theil_ds = TheilDSim(y, regimes, 999) | ||
partition_col : str, optional | ||
If `partition` is a DataFrame, specify the column to be used. | ||
>>> theil_ds.bg_pvalue | ||
array([0.4 , 0.344, 0.001, 0.001, 0.034, 0.072, 0.032]) | ||
""" | ||
|
||
def __init__(self, y, partition, permutations=99, column=None, partition_col=None): | ||
observed = TheilD(y, partition, column=column, partition_col=partition_col) | ||
def __init__(self, y, partition, permutations=99): | ||
observed = TheilD(y, partition) | ||
bg_ct = observed.bg == observed.bg # already have one extreme value | ||
bg_ct = bg_ct * 1.0 | ||
results = [observed] | ||
for _ in range(permutations): | ||
yp = np.random.permutation(y) | ||
yp = numpy.random.permutation(y) | ||
t = TheilD(yp, partition) | ||
bg_ct += 1.0 * t.bg >= observed.bg | ||
results.append(t) | ||
self.results = results | ||
self.T = observed.T | ||
self.bg_pvalue = bg_ct / (permutations * 1.0 + 1) | ||
self.bg = np.array([r.bg for r in results]) | ||
self.wg = np.array([r.wg for r in results]) | ||
self.bg = numpy.array([r.bg for r in results]) | ||
self.wg = numpy.array([r.wg for r in results]) |