+def make_error(rng, n, mu=0, varu=1, method='normal'):
+"""
+ make_error: generate error term for a given distribution
+
+ Arguments:
+ ----------
+ rng: random number object
+ n: number of observations
+ mu: mean (when needed)
+ varu: variance (when needed)
+ method: type of distribution, one of
+ normal, laplace, cauchy, lognormal
+
+ Returns:
+ --------
+ u: nx1 vector of random errors
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from spreg import make_error
+ >>> rng = np.random.default_rng(12345)
+ >>> make_error(rng,5)
+ array([[-1.42382504],
+ [ 1.26372846],
+ [-0.87066174],
+ [-0.25917323],
+ [-0.07534331]])
+
+ """
+ # normal - standard normal is default
+ if method == 'normal':
+     sdu = math.sqrt(varu)
+     u = rng.normal(loc=mu, scale=sdu, size=n).reshape(n, 1)
+ # laplace with thicker tails
+ elif method == 'laplace':
+     sdu = math.sqrt(varu / 2.0)
+     u = rng.laplace(loc=mu, scale=sdu, size=n).reshape(n, 1)
+ # cauchy, ill-behaved, no mean or variance defined
+ elif method == 'cauchy':
+     u = rng.standard_cauchy(size=n).reshape(n, 1)
+ elif method == 'lognormal':
+     sdu = math.sqrt(varu)
+     u = rng.lognormal(mean=mu, sigma=sdu, size=n).reshape(n, 1)
+ # all others yield a warning
+ else:
+     print('Warning: Unsupported distribution')
+     u = None
+ return u
+
+
+
+def make_x(rng, n, mu=[0], varu=[1], cor=0, method='uniform'):
+"""
+ make_x: generate a matrix of k columns of x for a given distribution
+
+ Arguments:
+ ----------
+ rng: random number object
+ n: number of observations
+ mu: mean as a list
+ varu: variance as a list
+ cor: correlation as a float (for bivariate normal only)
+ method: type of distribution, one of
+ uniform, normal, bivnormal (bivariate normal)
+
+ Returns:
+ --------
+ x: nxk matrix of x variables
+
+ Note:
+ -----
+ Uniform and normal generate separate draws, bivariate normal generates
+ correlated draws
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from spreg import make_x
+ >>> rng = np.random.default_rng(12345)
+ >>> make_x(rng,5,mu=[0,1],varu=[1,4])
+ array([[0.78751508, 2.30580253],
+ [1.09728308, 4.14520464],
+ [2.76215497, 1.29373239],
+ [2.3426149 , 4.6609906 ],
+ [1.35484323, 6.52500165]])
+
+ """
+ # check on k dimension
+ k = len(mu)
+ if k == len(varu):
+     # initialize
+     x = np.zeros((n, k))
+     for i in range(k):
+         # uniform - range is derived from variance since var = (1/12)range^2
+         # range is found as square root of 12 times variance
+         # for 0-1, varu should be 0.0833333
+         # low is always 0
+         if method == 'uniform':
+             sdu = math.sqrt(12.0 * varu[i])
+             x[:, i] = rng.uniform(low=0, high=sdu, size=n)
+         # normal - independent normal draws
+         elif method == 'normal':
+             sdu = math.sqrt(varu[i])
+             x[:, i] = rng.normal(loc=mu[i], scale=sdu, size=n)
+         # bivariate normal - only for k=2
+         elif method == 'bivnormal':
+             if k != 2:
+                 print('Error: Wrong dimension for k')
+                 x = None
+                 return x
+             else:
+                 ucov = cor * math.sqrt(varu[0] * varu[1])
+                 mcov = [[varu[0], ucov], [ucov, varu[1]]]
+                 x = rng.multivariate_normal(mean=mu, cov=mcov, size=n)
+                 return x
+         else:
+             print('Warning: Unsupported distribution')
+             x = None
+ else:
+     x = None
+ return x
+
+
+
+def make_wx(x, w, o=1):
+"""
+ make_wx: generate a matrix spatially lagged x given matrix x
+
+ x must be previously generated using make_x, no constant included
+
+ Arguments:
+ ----------
+ x: x matrix - no constant
+ w: row-standardized spatial weights in spreg format
+ o: order of contiguity, default o=1
+
+ Returns:
+ --------
+ wx: nx(kxo) matrix of spatially lagged x variables
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_wx
+ >>> rng = np.random.default_rng(12345)
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> make_wx(x,w)[0:5,:]
+ array([[1.12509217],
+ [1.87409079],
+ [1.36225472],
+ [2.1491645 ],
+ [2.80255786]])
+
+ """
+ if w.n != x.shape[0]:
+     print("Error: incompatible weights dimensions")
+     return None
+ w1x = libpysal.weights.lag_spatial(w, x)
+ wx = w1x
+ if o > 1:
+     for i in range(1, o):
+         whx = libpysal.weights.lag_spatial(w, w1x)
+         w1x = whx
+         wx = np.hstack((wx, whx))
+ return wx
+
+
+
+
+def make_xb(x, beta):
+"""
+ make_xb: generate a column xb as matrix x (constant added)
+ times list beta (includes coefficient for constant term)
+
+ Arguments:
+ ----------
+ x: n x (k-1) matrix for x variables
+ beta: k length list of regression coefficients
+
+ Returns:
+ --------
+ xb: nx1 vector of x times beta
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from spreg import make_x, make_xb
+ >>> rng = np.random.default_rng(12345)
+ >>> x = make_x(rng,5,mu=[0,1],varu=[1,4])
+ >>> make_xb(x,[1,2,3])
+ array([[ 9.49243776],
+ [15.63018007],
+ [10.4055071 ],
+ [19.66820159],
+ [23.28469141]])
+ """
+ n = x.shape[0]
+ k = x.shape[1]
+ if k + 1 != len(beta):
+     print("Error: Incompatible dimensions")
+     return None
+ else:
+     b = np.array(beta)[:, np.newaxis]
+     x1 = np.hstack((np.ones((n, 1)), x))  # include constant
+     xb = np.dot(x1, b)
+     return xb
+
+
+
+def make_wxg(wx, gamma):
+"""
+ make_wxg: generate a column wxg as matrix wx (no constant)
+ times list gamma (coefficient of spatially lagged x)
+
+ Arguments:
+ ----------
+ wx: n x ((k-1)xo) matrix for spatially lagged x variables of all orders
+ gamma: (k-1)*o length list of regression coefficients for spatially lagged x
+
+ Returns:
+ --------
+ wxg: nx1 vector of wx times gamma
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_wx, make_wxg
+ >>> rng = np.random.default_rng(12345)
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> wx = make_wx(x,w)
+ >>> print(wx.shape)
+ (25, 1)
+ >>> make_wxg(wx,[2,4])[0:5,:]
+ array([[ 2.25018434, 4.50036868],
+ [ 3.74818158, 7.49636316],
+ [ 2.72450944, 5.44901889],
+ [ 4.298329 , 8.59665799],
+ [ 5.60511572, 11.21023145]])
+
+ """
+ k = wx.shape[1]
+ if (k > 1):
+     if k != len(gamma):
+         print("Error: Incompatible dimensions")
+         return None
+     else:
+         g = np.array(gamma)[:, np.newaxis]
+         wxg = np.dot(wx, g)
+ else:  # gamma is a scalar
+     wxg = wx * gamma
+ return wxg
+
+
+
+def dgp_errproc(u, w, lam=0.5, model='sar', imethod='power_exp'):
+"""
+ dgp_errproc: generates pure spatial error process
+
+ Arguments:
+ ----------
+ u: random error vector
+ w: spatial weights object
+ lam: spatial autoregressive parameter
+ model: type of process ('sar' or 'ma')
+ imethod: method for inverse transformation, default = 'power_exp'
+
+ Returns:
+ --------
+ y : vector of observations following a spatial AR or MA process
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, dgp_errproc
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> dgp_errproc(u,w)[0:5,:]
+ array([[-1.43760658],
+ [ 0.69778271],
+ [-0.7750646 ],
+ [-0.47750452],
+ [-0.72377417]])
+
+ """
+ n0 = u.shape[0]
+ if w.n != n0:
+     print("Error: incompatible weights dimensions")
+     return None
+ if model == 'sar':
+     y = inverse_prod(w, u, lam, inv_method=imethod)
+ elif model == 'ma':
+     y = u + lam * libpysal.weights.lag_spatial(w, u)
+ else:
+     print("Error: unsupported model type")
+     return None
+ return y
+
+
+
+
+def dgp_ols(u, xb):
+"""
+ dgp_ols: generates y for non-spatial process with given xb and error term u
+
+ Arguments:
+ ----------
+ u: random error vector
+ xb: vector of xb
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, dgp_ols
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> dgp_ols(u,xb)[0:5,:]
+ array([[5.22803968],
+ [3.60291127],
+ [1.02632633],
+ [1.37589879],
+ [5.07165754]])
+
+ """
+ n1 = u.shape[0]
+ n2 = xb.shape[0]
+ if n1 != n2:
+     print("Error: dimension mismatch")
+     return None
+ y = xb + u
+ return y
+
+
+
+def dgp_slx(u, xb, wxg, ybin=False):
+"""
+ dgp_slx: generates y for SLX with given xb, wxg, and error term u
+
+ Arguments:
+ ----------
+ u: random error vector
+ xb: vector of xb
+ wxg: vector of wxg
+ ybin: flag for binary dependent variable, default = False
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, make_wx, make_wxg, dgp_slx
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> wx = make_wx(x,w)
+ >>> wxg = make_wxg(wx,[2])
+ >>> dgp_slx(u, xb, wxg)[0:5,:]
+ array([[8.85854389],
+ [7.17524694],
+ [3.83674621],
+ [4.73103929],
+ [8.37023076]])
+ """
+ n0 = u.shape[0]
+ n1 = xb.shape[0]
+ n2 = wxg.shape[0]
+ if n0 != n1:
+     print("Error: dimension mismatch")
+     return None
+ elif n1 != n2:
+     print("Error: dimension mismatch")
+     return None
+ y = xb + wxg + u
+ if ybin:  # probit case, turn into 0-1
+     y = make_bin(y)
+ return y
+
+
+
+def dgp_sperror(u, xb, w, lam=0.5, model='sar', imethod='power_exp', ybin=False):
+"""
+ dgp_sperror: generates y for spatial error model with given xb, weights,
+ spatial parameter lam, error term, method for inverse transform
+
+ Arguments:
+ ----------
+ u: random error
+ xb: vector of xb
+ w: spatial weights
+ lam: spatial coefficient
+ model: type of process ('sar' or 'ma')
+ imethod: method for inverse transformation, default = 'power_exp'
+ ybin: flag for binary dependent variable
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, dgp_sperror
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> dgp_sperror(u, xb, w)[0:5,:]
+ array([[5.21425813],
+ [3.03696553],
+ [1.12192347],
+ [1.15756751],
+ [4.42322667]])
+ """
+ n0 = u.shape[0]
+ n1 = xb.shape[0]
+ if n0 != n1:
+     print("Error: incompatible weights dimensions")
+     return None
+ elif w.n != n1:
+     print("Error: incompatible weights dimensions")
+     return None
+ if model == 'sar':
+     u1 = inverse_prod(w, u, lam, inv_method=imethod)
+ elif model == 'ma':
+     u1 = u + lam * libpysal.weights.lag_spatial(w, u)
+ else:
+     print("Error: unsupported model type")
+     return None
+ y = xb + u1
+ if ybin:
+     y = make_bin(y)
+ return y
+
+
+
+def dgp_slxerror(u, xb, wxg, w, lam=0.5, model='sar', imethod='power_exp', ybin=False):
+"""
+ dgp_slxerror: generates y for SLX spatial error model with xb, wxg, weights,
+ spatial parameter lam, model type (sar or ma),
+ error term, method for inverse transform
+
+ Arguments:
+ ----------
+ u: random error
+ xb: vector of xb
+ wxg: vector of wxg
+ w: spatial weights
+ lam: spatial coefficient
+ model: type of process ('sar' or 'ma')
+ imethod: method for inverse transformation, default = 'power_exp'
+ ybin: flag for binary 0-1 dependent variable
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, make_wx, make_wxg, dgp_slxerror
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> wx = make_wx(x,w)
+ >>> wxg = make_wxg(wx,[2])
+ >>> dgp_slxerror(u,xb,wxg,w)[0:5,:]
+ array([[8.84476235],
+ [6.6093012 ],
+ [3.93234334],
+ [4.51270801],
+ [7.7217999 ]])
+ """
+
+ n0 = u.shape[0]
+ n1 = xb.shape[0]
+ n2 = wxg.shape[0]
+ if n0 != n1:
+     print("Error: dimension mismatch")
+     return None
+ elif n1 != n2:
+     print("Error: dimension mismatch")
+     return None
+ if w.n != n1:
+     print("Error: incompatible weights dimensions")
+     return None
+ if model == 'sar':
+     u1 = inverse_prod(w, u, lam, inv_method=imethod)
+ elif model == 'ma':
+     u1 = u + lam * libpysal.weights.lag_spatial(w, u)
+ else:
+     print("Error: unsupported model type")
+     return None
+ y = xb + wxg + u1
+ if ybin:
+     y = make_bin(y)
+ return y
+
+
+def dgp_lagerr(u, xb, w, rho=0.5, lam=0.2, model='sar', imethod='power_exp', ybin=False):
+"""
+ dgp_lagerr: generates y for spatial lag model with sar or ma errors
+ with xb, weights,
+ spatial parameter rho, spatial parameter lambda,
+ model for spatial process,
+ error term, method for inverse transform
+
+ Arguments:
+ ----------
+ u: random error
+ xb: vector of xb
+ w: spatial weights
+ rho: spatial coefficient for lag
+ lam: spatial coefficient for error
+ model: spatial process for error
+ imethod: method for inverse transformation, default = 'power_exp'
+ ybin: flag for binary 0-1 dependent variable
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, dgp_lagerr
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> dgp_lagerr(u, xb, w)[0:5,:]
+ array([[10.13845523],
+ [ 7.53009531],
+ [ 5.40644034],
+ [ 5.51132886],
+ [ 8.58872366]])
+ """
+ n0 = u.shape[0]
+ n1 = xb.shape[0]
+ if n0 != n1:
+     print("Error: dimension mismatch")
+     return None
+ if w.n != n1:
+     print("Error: incompatible weights dimensions")
+     return None
+ if model == 'sar':
+     u1 = inverse_prod(w, u, lam, inv_method=imethod)
+ elif model == 'ma':
+     u1 = u + lam * libpysal.weights.lag_spatial(w, u)
+ else:
+     print("Error: unsupported model type")
+     return None
+ y1 = xb + u1
+ y = inverse_prod(w, y1, rho, inv_method=imethod)
+ if ybin:
+     y = make_bin(y)
+ return y
+
+
+
+def dgp_gns(u, xb, wxg, w, rho=0.5, lam=0.2, model='sar', imethod='power_exp', ybin=False):
+"""
+ dgp_gns: generates y for general nested model with sar or ma errors
+ with xb, wxg, weights,
+ spatial parameter rho, spatial parameter lambda,
+ model for spatial process,
+ error term, method for inverse transform
+
+ Arguments:
+ ----------
+ u: random error
+ xb: vector of xb
+ wxg: vector of wxg
+ w: spatial weights
+ rho: spatial coefficient for lag
+ lam: spatial coefficient for error
+ model: spatial process for error
+ imethod: method for inverse transformation, default = 'power_exp'
+ ybin: flag for binary 0-1 dependent variable
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, make_wx, make_wxg, dgp_gns
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> wx = make_wx(x,w)
+ >>> wxg = make_wxg(wx,[2])
+ >>> dgp_gns(u,xb,wxg,w)[0:5,:]
+ array([[18.04158549],
+ [14.96336153],
+ [11.95902806],
+ [12.44108728],
+ [15.47860632]])
+ """
+ n0 = u.shape[0]
+ n1 = xb.shape[0]
+ n2 = wxg.shape[0]
+ if n0 != n1:
+     print("Error: dimension mismatch")
+     return None
+ elif n1 != n2:
+     print("Error: dimension mismatch")
+     return None
+ if w.n != n1:
+     print("Error: incompatible weights dimensions")
+     return None
+ if model == 'sar':
+     u1 = inverse_prod(w, u, lam, inv_method=imethod)
+ elif model == 'ma':
+     u1 = u + lam * libpysal.weights.lag_spatial(w, u)
+ else:
+     print("Error: unsupported model type")
+     return None
+ y1 = xb + wxg + u1
+ y = inverse_prod(w, y1, rho, inv_method=imethod)
+ if ybin:
+     y = make_bin(y)
+ return y
+
+
+
+def dgp_mess(u, xb, w, rho=0.5):
+"""
+ dgp_mess: generates y for MESS spatial lag model with xb, weights,
+ spatial parameter rho (gets converted into alpha),
+ sigma/method for the error term
+
+ Arguments:
+ ----------
+ u: random error
+ xb: vector of xb
+ w: spatial weights
+ rho: spatial coefficient (converted into alpha)
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, dgp_mess
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> w = libpysal.weights.lat2W(5, 5)
+ >>> w.transform = "r"
+ >>> dgp_mess(u, xb, w)[0:5,:]
+ array([[10.12104421],
+ [ 7.45561055],
+ [ 5.32807674],
+ [ 5.55549492],
+ [ 8.62685145]])
+ """
+ n0 = u.shape[0]
+ n1 = xb.shape[0]
+ if n0 != n1:
+     print("Error: dimension mismatch")
+     return None
+ if w.n != n1:
+     print("Error: incompatible weights dimensions")
+     return None
+ bigw = libpysal.weights.full(w)[0]
+ alpha = np.log(1 - rho)  # convert between rho and alpha
+ aw = -alpha * bigw  # inverse exponential is -alpha
+ xbu = xb + u
+ y = np.dot(expm(aw), xbu)
+ return y
+
+
+
+def dgp_probit(u, xb):
+"""
+ dgp_probit: generates y for a non-spatial probit process with given xb
+ and error term u;
+ y is set to 1 when the linear model prediction exceeds its mean, 0 otherwise
+
+ Arguments:
+ ----------
+ u: random error vector
+ xb: vector of xb
+
+ Returns:
+ ----------
+ y: vector of observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, dgp_probit
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> dgp_probit(u,xb)[0:5,:]
+ array([[1],
+ [0],
+ [0],
+ [0],
+ [1]])
+
+ """
+ n1 = u.shape[0]
+ n2 = xb.shape[0]
+ if n1 != n2:
+     print("Error: dimension mismatch")
+     return None
+ yy = xb + u
+ mm = yy.mean()
+ y = (yy > mm) * 1
+ return y
+
+
+
+def make_bin(yy):
+"""
+ make_bin: generates y as 0-1 variable for y > mean of y,
+ i.e., for discrete dependent variables
+
+ Arguments:
+ ----------
+ yy: initial dependent variable vector
+
+ Returns:
+ ----------
+ y: vector of 0-1 observations on dependent variable
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import make_x, make_xb, dgp_ols, make_bin
+ >>> rng = np.random.default_rng(12345)
+ >>> u = make_x(rng,25,mu=[0],varu=[1], method='normal')
+ >>> x = make_x(rng,25,mu=[0],varu=[1])
+ >>> xb = make_xb(x,[1,2])
+ >>> yy = dgp_ols(u,xb)
+ >>> make_bin(yy)[0:5,:]
+ array([[1],
+ [0],
+ [0],
+ [0],
+ [1]])
+
+ """
+ mm = yy.mean()
+ y = (yy > mm) * 1
+ return y
+
+
+
+
+def make_heterror(u, v):
+"""
+ make_heterror: transforms constant variance error term into
+ a heteroskedastic error vector
+
+ Arguments:
+ ----------
+ u: random error vector (constant variance assumed)
+ v: matching vector of variance multipliers
+ if variance of u is 1, then variance of the result will be v
+
+ Returns:
+ --------
+ hu: matching vector with heteroskedastic errors
+
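+ Examples
+ --------
+
+ A minimal illustration (it assumes make_heterror is importable from spreg
+ like the other generators documented above): with unit errors and variance
+ multipliers of 1 and 4, the errors are rescaled by 1 and 2, respectively.
+
+ >>> import numpy as np
+ >>> from spreg import make_heterror
+ >>> u = np.ones((4, 1))
+ >>> v = np.array([[1.0], [1.0], [4.0], [4.0]])
+ >>> make_heterror(u, v)
+ array([[1.],
+        [1.],
+        [2.],
+        [2.]])
+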
+ """
+ n0 = u.shape[0]
+ n1 = v.shape[0]
+ if n0 != n1:
+     print("Error: incompatible vector dimensions")
+     return None
+ # multiply error with square root of variance multiplier v
+ sev = np.sqrt(v)
+ hu = u * sev
+ return hu
+
+
+
+def make_vmult(n, method="linear", vlow=[1], vup=[4], rng=None):
+"""
+ make_vmult : helper function to create variance multiplier
+ for use in heteroskedastic errors (dgp_heterror)
+
+ Arguments:
+ ----------
+ n: number of observations
+ method: type of multiplier
+ linear: linear interpolation between vlow and vup
+ group: groupwise heteroskedasticity of vup by group vlow
+ uniform: uniform random number between vlow and vup
+ vlow: list with lower variance multiplier (default=1) or group indicators
+ group indicators give number of elements in each group, must add up to n
+ vup: list with upper variance multiplier (default=4) or group variance multipliers
+ list of group variance multipliers must match length of vlow
+ rng: random number object (required for method=uniform, otherwise None)
+
+ Returns:
+ --------
+ v: nx1 vector with variance multipliers
+
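+ Examples
+ --------
+
+ A minimal sketch (assuming make_vmult is importable from spreg like the
+ other functions in this module): groupwise multipliers for two groups of
+ 3 and 2 observations with group variance multipliers 1 and 4.
+
+ >>> from spreg import make_vmult
+ >>> make_vmult(5, method="group", vlow=[3, 2], vup=[1, 4])
+ array([[1],
+        [1],
+        [1],
+        [4],
+        [4]])
+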
+ """
+ v = np.ones((n, 1))
+ if method == "uniform":
+     if not (rng == None):
+         v = rng.uniform(low=vlow[0], high=vup[0], size=n)
+         v = v.reshape(-1, 1)
+     else:
+         print("Error: Missing random number object")
+         return None
+ elif method == "linear":
+     vlo = vlow[0]
+     slop = (vup[0] - vlow[0]) / (n - 1)
+     for i in range(n):
+         v[i, 0] = vlo + slop * i
+ elif method == "group":
+     if len(vup) == len(vlow) and sum(vlow) == n:
+         h = [[i] * j for i, j in zip(vup, vlow)]  # one sublist for each group
+         # flatten list and convert to numpy array
+         hh = []
+         for ii in h:
+             hh += ii
+         v = np.array(hh)
+         v = v.reshape(-1, 1)
+     else:
+         print("Error: Incompatible dimensions")
+         v = None
+ else:
+     print('Error: Unsupported method')
+     v = None
+ return v
+
+
+def f_stat(reg, df=0):
+"""
+ Calculates the f-statistic and associated p-value for multiple
+ coefficient constraints :cite:`Greene2003`.
+ (For two stage least squares see f_stat_tsls)
+ (default is F statistic for regression)
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+ df : number of coefficient constraints
+ (zero constraint for last df coefficients in betas)
+
+ Returns
+ ----------
+ fs_result : tuple
+ includes value of F statistic and associated p-value
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the F-statistic for the regression.
+
+ >>> testresult = spreg.f_stat(reg)
+
+ Print the results tuple, including the statistic and its significance.
+
+ >>> print("%12.12f"%testresult[0],"%12.12f"%testresult[1])
+ 28.385629224695 0.000000009341
+
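+ Notes
+ -----
+ In the default case (df=0) this is the overall regression F test,
+
+ .. math:: F = \frac{\sum_i (\hat{y}_i - \bar{y})^2 / (k - 1)}{u'u / (n - k)}
+
+ evaluated against an F distribution with (k-1, n-k) degrees of freedom.
+ When df > 0, the numerator is instead the difference in residual sums of
+ squares between the constrained and unconstrained regressions divided by df.
+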
+ """
+ k = reg.k  # (scalar) number of ind. vars (includes constant)
+ n = reg.n  # (scalar) number of observations
+ utu = reg.utu  # (scalar) residual sum of squares
+ # default case, all coefficients
+ if df == 0:
+     r = k - 1
+     predy = reg.predy  # (array) vector of predicted values (n x 1)
+     mean_y = reg.mean_y  # (scalar) mean of dependent observations
+     U = np.sum((predy - mean_y) ** 2)
+ else:  # F test on last df coefficients
+     y = reg.y
+     r = df
+     x0 = reg.x[:, :-r]
+     olsr = BaseOLS(y, x0)  # constrained regression
+     rtr = olsr.utu
+     U = rtr - utu
+ fStat = (U / r) / (utu / (n - k))
+ pValue = stats.f.sf(fStat, r, n - k)
+ fs_result = (fStat, pValue)
+ return fs_result
+
+
+
+def t_stat(reg, z_stat=False):
+"""
+ Calculates the t-statistics (or z-statistics) and associated
+ p-values. :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+ z_stat : boolean
+ If True run z-stat instead of t-stat
+
+ Returns
+ -------
+ ts_result : list of tuples
+ each tuple includes value of t statistic (or z
+ statistic) and associated p-value
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate t-statistics for the regression coefficients.
+
+ >>> testresult = spreg.t_stat(reg)
+
+ Print the tuples that contain the t-statistics and their significances.
+
+ >>> print("%12.12f"%testresult[0][0], "%12.12f"%testresult[0][1], "%12.12f"%testresult[1][0], "%12.12f"%testresult[1][1], "%12.12f"%testresult[2][0], "%12.12f"%testresult[2][1])
+ 14.490373143689 0.000000000000 -4.780496191297 0.000018289595 -2.654408642718 0.010874504910
+ """
+
+ k = reg.k  # (scalar) number of ind. vars (includes constant)
+ n = reg.n  # (scalar) number of observations
+ vm = reg.vm  # (array) coefficients of variance matrix (k x k)
+ betas = reg.betas  # (array) coefficients of the regressors (1 x k)
+ variance = vm.diagonal()
+ tStat = betas[list(range(0, len(vm)))].reshape(
+     len(vm),
+ ) / np.sqrt(variance)
+ ts_result = []
+ for t in tStat:
+     if z_stat:
+         ts_result.append((t, stats.norm.sf(abs(t)) * 2))
+     else:
+         ts_result.append((t, stats.t.sf(abs(t), n - k) * 2))
+ return ts_result
+
+
+
+def r2(reg):
+"""
+ Calculates the R^2 value for the regression. :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ ----------
+ r2_result : float
+ value of the coefficient of determination for the
+ regression
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the R^2 value for the regression.
+
+ >>> testresult = spreg.r2(reg)
+
+ Print the result.
+
+ >>> print("%1.8f"%testresult)
+ 0.55240404
+
+ """
+ y = reg.y  # (array) vector of dep observations (n x 1)
+ mean_y = reg.mean_y  # (scalar) mean of dep observations
+ utu = reg.utu  # (scalar) residual sum of squares
+ ss_tot = ((y - mean_y) ** 2).sum(0)
+ r2 = 1 - utu / ss_tot
+ r2_result = r2[0]
+ return r2_result
+
+
+
+
+def ar2(reg):
+"""
+ Calculates the adjusted R^2 value for the regression. :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ ----------
+ ar2_result : float
+ value of R^2 adjusted for the number of explanatory
+ variables.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the adjusted R^2 value for the regression.
+
+ >>> testresult = spreg.ar2(reg)
+
+ Print the result.
+
+ >>> print("%1.8f"%testresult)
+ 0.53294335
+
+ """
+ k = reg.k  # (scalar) number of ind. variables (includes constant)
+ n = reg.n  # (scalar) number of observations
+ ar2_result = 1 - (1 - r2(reg)) * (n - 1) / (n - k)
+ return ar2_result
+
+
+
+
+def se_betas(reg):
+"""
+ Calculates the standard error of the regression coefficients. :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ ----------
+ se_result : array
+ includes standard errors of each coefficient (1 x k)
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the standard errors of the regression coefficients.
+
+ >>> testresult = spreg.se_betas(reg)
+
+ Print the vector of standard errors.
+
+ >>> testresult
+ array([4.73548613, 0.33413076, 0.10319868])
+
+ """
+ vm = reg.vm  # (array) coefficients of variance matrix (k x k)
+ variance = vm.diagonal()
+ se_result = np.sqrt(variance)
+ return se_result
+
+
+
+
+def log_likelihood(reg):
+"""
+ Calculates the log-likelihood value for the regression. :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ ll_result : float
+ value for the log-likelihood of the regression.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the log-likelihood for the regression.
+
+ >>> testresult = spreg.log_likelihood(reg)
+
+ Print the result.
+
+ >>> testresult
+ -187.3772388121491
+
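+ Notes
+ -----
+ The value returned is the Gaussian log-likelihood evaluated at the
+ residual variance estimate u'u/n,
+
+ .. math:: \ln L = -\frac{n}{2}\left[\ln(2\pi) + \ln(u'u/n) + 1\right]
+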
+ """
+ n = reg.n  # (scalar) number of observations
+ utu = reg.utu  # (scalar) residual sum of squares
+ ll_result = -0.5 * (n * (np.log(2 * pi)) + n * np.log(utu / n) + (utu / (utu / n)))
+ return ll_result
+
+
+
+
+def akaike(reg):
+"""
+ Calculates the Akaike Information Criterion. :cite:`Akaike1974`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ aic_result : scalar
+ value for Akaike Information Criterion of the
+ regression.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the Akaike Information Criterion (AIC).
+
+ >>> testresult = spreg.akaike(reg)
+
+ Print the result.
+
+ >>> testresult
+ 380.7544776242982
+
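+ Notes
+ -----
+ For models estimated by maximum likelihood (a logll attribute is present)
+ the criterion is :math:`AIC = 2k - 2\ln L`; for OLS it is computed from the
+ residual sum of squares as :math:`AIC = 2k + n\left[\ln(2\pi u'u/n) + 1\right]`.
+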
+ """
+ k = reg.k  # (scalar) number of explanatory vars (including constant)
+ try:  # ML estimation, logll already exists
+     # spatial coefficient included in k
+     aic_result = 2.0 * k - 2.0 * reg.logll
+ except AttributeError:  # OLS case
+     n = reg.n  # (scalar) number of observations
+     utu = reg.utu  # (scalar) residual sum of squares
+     aic_result = 2 * k + n * (np.log((2 * np.pi * utu) / n) + 1)
+ return aic_result
+
+
+
+
+def schwarz(reg):
+"""
+ Calculates the Schwarz Information Criterion. :cite:`Schwarz1978`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ bic_result : scalar
+ value for Schwarz (Bayesian) Information Criterion of
+ the regression.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the Schwarz Information Criterion.
+
+ >>> testresult = spreg.schwarz(reg)
+
+ Print the results.
+
+ >>> np.round(testresult, 5)
+ 386.42994
+
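+ Notes
+ -----
+ For models estimated by maximum likelihood (a logll attribute is present)
+ the criterion is :math:`BIC = k\ln(n) - 2\ln L`; for OLS it is computed from
+ the residual sum of squares as
+ :math:`BIC = k\ln(n) + n\left[\ln(2\pi u'u/n) + 1\right]`.
+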
+ """
+ n = reg.n  # (scalar) number of observations
+ k = reg.k  # (scalar) number of ind. variables (including constant)
+ try:  # ML case, logll already computed
+     # spatial coeff included in k
+     sc_result = k * np.log(n) - 2.0 * reg.logll
+ except AttributeError:  # OLS case
+     utu = reg.utu  # (scalar) residual sum of squares
+     sc_result = k * np.log(n) + n * (np.log((2 * np.pi * utu) / n) + 1)
+ return sc_result
+
+
+
+
+def condition_index(reg):
+"""
+ Calculates the multicollinearity condition index according to Belsey,
+ Kuh and Welsh (1980) :cite:`Belsley1980`.
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ ci_result : float
+ scalar value for the multicollinearity condition
+ index.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the condition index to check for multicollinearity.
+
+ >>> testresult = spreg.condition_index(reg)
+
+ Print the result.
+
+ >>> print("%1.3f"%testresult)
+ 6.542
+
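+ Notes
+ -----
+ The index is computed from the cross-product matrix X'X (or H'H for IV
+ estimators), scaled by its diagonal, as the square root of the ratio of its
+ largest to its smallest eigenvalue, :math:`CI = \sqrt{\lambda_{max} / \lambda_{min}}`.
+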
+ """
+ if hasattr(reg, "xtx"):
+     xtx = reg.xtx  # (array) k x k projection matrix (includes constant)
+ elif hasattr(reg, "hth"):
+     xtx = reg.hth  # (array) k x k projection matrix (includes constant)
+ diag = np.diagonal(xtx)
+ scale = xtx / diag
+ eigval = np.linalg.eigvals(scale)
+ max_eigval = max(eigval)
+ min_eigval = min(eigval)
+ ci_result = sqrt(max_eigval / min_eigval)
+ return ci_result
+
+
+
+
+def jarque_bera(reg):
+"""
+ Jarque-Bera test for normality in the residuals. :cite:`Jarque1980`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ jb_result : dictionary
+ contains the statistic (jb) for the Jarque-Bera test
+ and the associated p-value (p-value)
+ df : integer
+ degrees of freedom for the test (always 2)
+ jb : float
+ value of the test statistic
+ pvalue : float
+ p-value associated with the statistic (chi^2
+ distributed with 2 df)
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the Jarque-Bera test for normality of residuals.
+
+ >>> testresult = spreg.jarque_bera(reg)
+
+ Print the degrees of freedom for the test.
+
+ >>> testresult['df']
+ 2
+
+ Print the test statistic.
+
+ >>> print("%1.3f"%testresult['jb'])
+ 1.836
+
+ Print the associated p-value.
+
+ >>> print("%1.4f"%testresult['pvalue'])
+ 0.3994
+
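+ Notes
+ -----
+ With :math:`S = \mu_3 / \mu_2^{3/2}` and :math:`K = \mu_4 / \mu_2^2` the
+ skewness and kurtosis of the residuals, the statistic is
+
+ .. math:: JB = n\left[\frac{S^2}{6} + \frac{(K-3)^2}{24}\right]
+
+ evaluated against a chi-squared distribution with 2 degrees of freedom.
+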
+ """
+ n = reg.n  # (scalar) number of observations
+ u = reg.u  # (array) residuals from regression
+ u2 = u ** 2
+ u3 = u ** 3
+ u4 = u ** 4
+ mu2 = np.mean(u2)
+ mu3 = np.mean(u3)
+ mu4 = np.mean(u4)
+ S = mu3 / (mu2 ** (1.5))  # skewness measure
+ K = mu4 / (mu2 ** 2)  # kurtosis measure
+ jb = n * (((S ** 2) / 6) + ((K - 3) ** 2) / 24)
+ pvalue = stats.chisqprob(jb, 2)
+ jb_result = {"df": 2, "jb": jb, "pvalue": pvalue}
+ return jb_result
+
+
+
+
+def breusch_pagan(reg, z=None):
+"""
+ Calculates the Breusch-Pagan test statistic to check for
+ heteroscedasticity. :cite:`Breusch1979`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+ z : array
+ optional input for specifying an alternative set of
+ variables (Z) to explain the observed variance. By
+ default this is a matrix of the squared explanatory
+ variables (X**2) with a constant added to the first
+ column if not already present. In the default case,
+ the explanatory variables are squared to eliminate
+ negative values.
+
+ Returns
+ -------
+ bp_result : dictionary
+ contains the statistic (bp) for the test and the
+ associated p-value (p-value)
+ bp : float
+ scalar value for the Breusch-Pagan test statistic
+ df : integer
+ degrees of freedom associated with the test (k)
+ pvalue : float
+ p-value associated with the statistic (chi^2
+ distributed with k df)
+
+ Notes
+ -----
+ x attribute in the reg object must have a constant term included. This is
+ standard for spreg.OLS so no testing done to confirm constant.
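+
+ With :math:`g = e^2 / (e'e/n) - 1` computed from the OLS residuals and Z the
+ matrix of squared explanatory variables (duplicate columns dropped), the
+ statistic is :math:`BP = \frac{1}{2} g'Z(Z'Z)^{-1}Z'g`, evaluated against a
+ chi-squared distribution with df equal to the number of columns of Z minus one.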
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the Breusch-Pagan test for heteroscedasticity.
+
+ >>> testresult = spreg.breusch_pagan(reg)
+
+ Print the degrees of freedom for the test.
+
+ >>> testresult['df']
+ 2
+
+ Print the test statistic.
+
+ >>> print("%1.3f"%testresult['bp'])
+ 7.900
+
+ Print the associated p-value.
+
+ >>> print("%1.4f"%testresult['pvalue'])
+ 0.0193
+
+ """
+ e2 = reg.u ** 2
+ e = reg.u
+ n = reg.n
+ k = reg.k
+ ete = reg.utu
+
+ den = ete / n
+ g = e2 / den - 1.0
+
+ if z == None:
+     x = reg.x
+     # constant = constant_check(x)
+     # if constant == False:
+     #     z = np.hstack((np.ones((n,1)),x))**2
+     # else:
+     #     z = x**2
+     z = spmultiply(x, x)
+ else:
+     # constant = constant_check(z)
+     # if constant == False:
+     #     z = np.hstack((np.ones((n,1)),z))
+     pass
+
+ n, p = z.shape
+
+ # Check to identify any duplicate columns in Z
+ omitcolumn = []
+ for i in range(p):
+     current = z[:, i]
+     for j in range(p):
+         check = z[:, j]
+         if i < j:
+             test = abs(current - check).sum()
+             if test == 0:
+                 omitcolumn.append(j)
+
+ uniqueomit = set(omitcolumn)
+ omitcolumn = list(uniqueomit)
+
+ # Now the identified columns must be removed (done in reverse to
+ # prevent renumbering)
+ omitcolumn.sort()
+ omitcolumn.reverse()
+ for c in omitcolumn:
+     z = np.delete(z, c, 1)
+ n, p = z.shape
+
+ df = p - 1
+
+ # Now that the variables are prepared, we calculate the statistic
+ zt = np.transpose(z)
+ gt = np.transpose(g)
+ gtz = np.dot(gt, z)
+ ztg = np.dot(zt, g)
+ ztz = np.dot(zt, z)
+ ztzi = la.inv(ztz)
+
+ part1 = np.dot(gtz, ztzi)
+ part2 = np.dot(part1, ztg)
+ bp_array = 0.5 * part2
+ bp = bp_array[0, 0]
+
+ pvalue = stats.chisqprob(bp, df)
+ bp_result = {"df": df, "bp": bp, "pvalue": pvalue}
+ return bp_result
+
+
+
+
+def white(reg):
+"""
+ Calculates the White test to check for heteroscedasticity. :cite:`White1980`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ white_result : dictionary
+ contains the statistic (white), degrees of freedom
+ (df) and the associated p-value (pvalue) for the
+ White test.
+ white : float
+ scalar value for the White test statistic.
+ df : integer
+ degrees of freedom associated with the test
+ pvalue : float
+ p-value associated with the statistic (chi^2
+ distributed with k df)
+
+ Notes
+ -----
+ x attribute in the reg object must have a constant term included. This is
+ standard for spreg.OLS so no testing done to confirm constant.
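+
+ The statistic itself is :math:`n R^2_{aux}`, where :math:`R^2_{aux}` comes
+ from an auxiliary regression of the squared residuals on the original
+ regressors, their squares and cross-products (constant and duplicate columns
+ removed, with a constant added back), and is evaluated against a chi-squared
+ distribution with degrees of freedom equal to the number of auxiliary
+ regressors minus one.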
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the White test for heteroscedasticity.
+
+ >>> testresult = spreg.white(reg)
+
+ Print the degrees of freedom for the test.
+
+ >>> print(testresult['df'])
+ 5
+
+ Print the test statistic.
+
+ >>> print("%1.3f"%testresult['wh'])
+ 19.946
+
+ Print the associated p-value.
+
+ >>> print("%1.4f"%testresult['pvalue'])
+ 0.0013
+
+ """
+ e = reg.u ** 2
+ k = int(reg.k)
+ n = int(reg.n)
+ y = reg.y
+ X = reg.x
+ # constant = constant_check(X)
+
+ # Check for constant, if none add one, see Greene 2003, pg. 222
+ # if constant == False:
+ #     X = np.hstack((np.ones((n,1)),X))
+
+ # Check for multicollinearity in the X matrix
+ ci = condition_index(reg)
+ if ci > 30:
+     white_result = "Not computed due to multicollinearity."
+     return white_result
+
+ # Compute cross-products and squares of the regression variables
+ if type(X).__name__ == "ndarray":
+     A = np.zeros((n, (k * (k + 1)) // 2))
+ elif type(X).__name__ == "csc_matrix" or type(X).__name__ == "csr_matrix":
+     # this is probably inefficient
+     A = SP.lil_matrix((n, (k * (k + 1)) // 2))
+ else:
+     raise Exception("unknown X type, %s" % type(X).__name__)
+ counter = 0
+ for i in range(k):
+     for j in range(i, k):
+         v = spmultiply(X[:, i], X[:, j], False)
+         A[:, counter] = v
+         counter += 1
+
+ # Append the original variables
+ A = sphstack(X, A)  # note: this also converts a LIL to CSR
+ n, k = A.shape
+
+ # Check to identify any duplicate or constant columns in A
+ omitcolumn = []
+ for i in range(k):
+     current = A[:, i]
+     # remove all constant terms (will add a constant back later)
+     if spmax(current) == spmin(current):
+         omitcolumn.append(i)
+         pass
+     # do not allow duplicates
+     for j in range(k):
+         check = A[:, j]
+         if i < j:
+             test = abs(current - check).sum()
+             if test == 0:
+                 omitcolumn.append(j)
+ uniqueomit = set(omitcolumn)
+ omitcolumn = list(uniqueomit)
+
+ # Now the identified columns must be removed
+ if type(A).__name__ == "ndarray":
+     A = np.delete(A, omitcolumn, 1)
+ elif type(A).__name__ == "csc_matrix" or type(A).__name__ == "csr_matrix":
+     # this is probably inefficient
+     keepcolumn = list(range(k))
+     for i in omitcolumn:
+         keepcolumn.remove(i)
+     A = A[:, keepcolumn]
+ else:
+     raise Exception("unknown A type, %s" % type(X).__name__)
+ A = sphstack(np.ones((A.shape[0], 1)), A)  # add a constant back in
+ n, k = A.shape
+
+ # Conduct the auxiliary regression and calculate the statistic
+ from . import ols as OLS
+
+ aux_reg = OLS.BaseOLS(e, A)
+ aux_r2 = r2(aux_reg)
+ wh = aux_r2 * n
+ df = k - 1
+ pvalue = stats.chisqprob(wh, df)
+ white_result = {"df": df, "wh": wh, "pvalue": pvalue}
+ return white_result
+
+
+
+
+def koenker_bassett(reg, z=None):
+"""
+ Calculates the Koenker-Bassett test statistic to check for
+ heteroscedasticity. :cite:`Koenker1982,Greene2003`
+
+ Parameters
+ ----------
+ reg : regression output
+ output from an instance of a regression class
+ z : array
+ optional input for specifying an alternative set of
+ variables (Z) to explain the observed variance. By
+ default this is a matrix of the squared explanatory
+ variables (X**2) with a constant added to the first
+ column if not already present. In the default case,
+ the explanatory variables are squared to eliminate
+ negative values.
+
+ Returns
+ -------
+ kb_result : dictionary
+ contains the statistic (kb), degrees of freedom (df)
+ and the associated p-value (pvalue) for the test.
+ kb : float
+ scalar value for the Koenker-Bassett test statistic.
+ df : integer
+ degrees of freedom associated with the test
+ pvalue : float
+ p-value associated with the statistic (chi^2
+ distributed)
+
+ Notes
+ -----
+ x attribute in the reg object must have a constant term included. This is
+ standard for spreg.OLS so no testing done to confirm constant.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the Koenker-Bassett test for heteroscedasticity.
+
+ >>> testresult = spreg.koenker_bassett(reg)
+
+ Print the degrees of freedom for the test.
+
+ >>> testresult['df']
+ 2
+
+ Print the test statistic.
+
+ >>> print("%1.3f"%testresult['kb'])
+ 5.694
+
+ Print the associated p-value.
+
+ >>> print("%1.4f"%testresult['pvalue'])
+ 0.0580
+
+ """
+ # The notation here matches that of Greene (2003).
+ u = reg.u ** 2
+ e = reg.u
+ n = reg.n
+ k = reg.k
+ x = reg.x
+ ete = reg.utu
+ # constant = constant_check(x)
+
+ ubar = ete / n
+ ubari = ubar * np.ones((n, 1))
+ g = u - ubari
+ v = (1.0 / n) * np.sum((u - ubar) ** 2)
+
+ if z == None:
+     x = reg.x
+     # constant = constant_check(x)
+     # if constant == False:
+     #     z = np.hstack((np.ones((n,1)),x))**2
+     # else:
+     #     z = x**2
+     z = spmultiply(x, x)
+ else:
+     # constant = constant_check(z)
+     # if constant == False:
+     #     z = np.hstack((np.ones((n,1)),z))
+     pass
+
+ n, p = z.shape
+
+ # Check to identify any duplicate columns in Z
+ omitcolumn = []
+ for i in range(p):
+     current = z[:, i]
+     for j in range(p):
+         check = z[:, j]
+         if i < j:
+             test = abs(current - check).sum()
+             if test == 0:
+                 omitcolumn.append(j)
+
+ uniqueomit = set(omitcolumn)
+ omitcolumn = list(uniqueomit)
+
+ # Now the identified columns must be removed (done in reverse to
+ # prevent renumbering)
+ omitcolumn.sort()
+ omitcolumn.reverse()
+ for c in omitcolumn:
+     z = np.delete(z, c, 1)
+ n, p = z.shape
+
+ df = p - 1
+
+ # Conduct the auxiliary regression.
+ zt = np.transpose(z)
+ gt = np.transpose(g)
+ gtz = np.dot(gt, z)
+ ztg = np.dot(zt, g)
+ ztz = np.dot(zt, z)
+ ztzi = la.inv(ztz)
+
+ part1 = np.dot(gtz, ztzi)
+ part2 = np.dot(part1, ztg)
+ kb_array = (1.0 / v) * part2
+ kb = kb_array[0, 0]
+
+ pvalue = stats.chisqprob(kb, df)
+ kb_result = {"kb": kb, "df": df, "pvalue": pvalue}
+ return kb_result
+
+
+
+
+def vif(reg):
+"""
+ Calculates the variance inflation factor for each independent variable.
+ For the ease of indexing the results, the constant is currently
+ included. This should be omitted when reporting the results to the
+ output text. :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ vif_result : list of tuples
+ each tuple includes the vif and the tolerance, the
+ order of the variables corresponds to their order in
+ the reg.x matrix
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Read the DBF associated with the Columbus data.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+
+ Create the dependent variable vector.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Create the matrix of independent variables.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression.
+
+ >>> reg = OLS(y,X)
+
+ Calculate the variance inflation factor (VIF).
+
+ >>> testresult = spreg.vif(reg)
+
+ Select the tuple for the income variable.
+
+ >>> incvif = testresult[1]
+
+ Print the VIF for income.
+
+ >>> print("%12.12f"%incvif[0])
+ 1.333117497189
+
+ Print the tolerance for income.
+
+ >>> print("%12.12f"%incvif[1])
+ 0.750121427487
+
+ Repeat for the home value variable.
+
+ >>> hovalvif = testresult[2]
+ >>> print("%12.12f"%hovalvif[0])
+ 1.333117497189
+ >>> print("%12.12f"%hovalvif[1])
+ 0.750121427487
+
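+ Notes
+ -----
+ For each column :math:`x_j` of X an auxiliary regression of :math:`x_j` on
+ the remaining columns is run; the tolerance is :math:`1 - R^2_j` and the
+ variance inflation factor is :math:`VIF_j = 1 / (1 - R^2_j)`.
+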
+ """
+ X = reg.x
+ n, k = X.shape
+ vif_result = []
+
+ for j in range(k):
+     Z = X.copy()
+     Z = np.delete(Z, j, 1)
+     y = X[:, j]
+     from . import ols as OLS
+
+     aux = OLS.BaseOLS(y, Z)
+     mean_y = aux.mean_y
+     utu = aux.utu
+     ss_tot = sum((y - mean_y) ** 2)
+     if ss_tot == 0:
+         resj = MISSINGVALUE
+     else:
+         r2aux = 1 - utu / ss_tot
+         tolj = 1 - r2aux
+         vifj = 1 / tolj
+         resj = (vifj, tolj)
+     vif_result.append(resj)
+ return vif_result
+
+
+
+
+def constant_check(array):
+"""
+ Checks to see numpy array includes a constant.
+
+ Parameters
+ ----------
+ array : array
+ an array of variables to be inspected
+
+ Returns
+ -------
+ constant : boolean
+ true signifies the presence of a constant
+
+ Example
+ -------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),"r")
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+ >>> reg = OLS(y,X)
+ >>> spreg.constant_check(reg.x)
+ True
+
+ """
+
+ n, k = array.shape
+ constant = False
+ for j in range(k):
+     variable = array[:, j]
+     varmin = variable.min()
+     varmax = variable.max()
+     if varmin == varmax:
+         constant = True
+         break
+ return constant
+
+
+
+
+def likratiotest(reg0, reg1):
+"""
+ Likelihood ratio test statistic :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg0 : regression object
+ for constrained model (H0)
+ reg1 : regression object
+ for unconstrained model (H1)
+
+ Returns
+ -------
+
+ likratio : dictionary
+ contains the statistic (likr), the degrees of
+ freedom (df) and the p-value (pvalue)
+ likr : float
+ likelihood ratio statistic
+ df : integer
+ degrees of freedom
+ p-value : float
+ p-value
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import scipy.stats as stats
+ >>> from spreg import ML_Lag, OLS
+ >>> from spreg import likratiotest
+
+ Use the baltim sample data set
+
+ >>> db = libpysal.io.open(examples.get_path("baltim.dbf"),'r')
+ >>> y_name = "PRICE"
+ >>> y = np.array(db.by_col(y_name)).T
+ >>> y.shape = (len(y),1)
+ >>> x_names = ["NROOM","NBATH","PATIO","FIREPL","AC","GAR","AGE","LOTSZ","SQFT"]
+ >>> x = np.array([db.by_col(var) for var in x_names]).T
+ >>> ww = libpysal.io.open(examples.get_path("baltim_q.gal"))
+ >>> w = ww.read()
+ >>> ww.close()
+ >>> w.transform = 'r'
+
+ OLS regression
+
+ >>> ols1 = OLS(y,x)
+
+ ML Lag regression
+
+ >>> mllag1 = ML_Lag(y,x,w)
+
+ >>> lr = likratiotest(ols1,mllag1)
+
+ >>> print("Likelihood Ratio Test: {0:.4f} df: {1} p-value: {2:.4f}".format(lr["likr"],lr["df"],lr["p-value"]))
+ Likelihood Ratio Test: 44.5721 df: 1 p-value: 0.0000
+
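+ Notes
+ -----
+ The statistic is :math:`LR = 2(\ln L_1 - \ln L_0)`, evaluated against a
+ chi-squared distribution with degrees of freedom equal to the difference in
+ the number of parameters between the unconstrained and constrained models
+ (with a minimum of one).
+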
+ """
+
+ likratio = {}
+
+ try:
+     likr = 2.0 * (reg1.logll - reg0.logll)
+ except AttributeError:
+     raise Exception("Missing or improper log-likelihoods in regression objects")
+ if likr < 0.0:  # always enforces positive likelihood ratio
+     likr = -likr
+
+ # generalize to multiple parameters, e.g., spatial Durbin
+ df = reg1.k - reg0.k
+ if not (df > 0):
+     df = 1
+
+ # pvalue = stats.chisqprob(likr, 1)
+ pvalue = stats.chisqprob(likr, df)
+ # likratio = {"likr": likr, "df": 1, "p-value": pvalue}
+ likratio = {"likr": likr, "df": df, "p-value": pvalue}
+ return likratio
+
+
+
+def dwh(reg):
+"""
+ Durbin-Wu-Hausman test on endogeneity of variables
+
+ A significant test statistic points to endogeneity
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+
+ Returns
+ -------
+ dwh : tuple with value of F-statistic in augmented regression
+ and associated p-value
+
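+ Examples
+ --------
+
+ A minimal usage sketch (illustrative only: it follows the columbus setup
+ used elsewhere in spreg, with DISCBD as instrument, assumes TSLS and dwh are
+ importable from the package, and does not reproduce the resulting values):
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> from spreg import TSLS, dwh
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"), "r")
+ >>> y = np.reshape(np.array(db.by_col("CRIME")), (49, 1))
+ >>> x = np.array([db.by_col("INC")]).T
+ >>> yd = np.array([db.by_col("HOVAL")]).T
+ >>> q = np.array([db.by_col("DISCBD")]).T
+ >>> reg = TSLS(y, x, yd, q=q)
+ >>> fstat, pvalue = dwh(reg)
+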
+ """
+ n = reg.n
+ ny = reg.yend.shape[1]  # number of endogenous variables
+ qq = reg.h  # all exogenous and instruments
+ xx = reg.z  # all exogenous and endogenous
+ # get predicted values for endogenous variables on all instruments
+ py = np.zeros((n, ny))
+ for i in range(ny):
+     yy = reg.yend[:, i].reshape(n, 1)
+     ols1 = BaseOLS(y=yy, x=qq)
+     yp = ols1.predy
+     py[0:n, i] = yp.flatten()
+ nxq = sphstack(xx, py)
+ # F-test in augmented regression
+ ols2 = BaseOLS(y=reg.y, x=nxq)
+ dwh = f_stat(ols2, df=ny)
+ return dwh
+
+
+class LMtests:
+"""
+ Lagrange Multiplier tests. Implemented as presented in :cite:`Anselin1996a` and :cite:`KoleyBera2024`
+
+ Parameters
+ ----------
+
+ ols : OLS
+ OLS regression object
+ w : W
+ Spatial weights instance
+ tests : list
+ Lists of strings with the tests desired to be performed.
+ Values may be:
+
+ * 'all': runs all the options (default)
+ * 'lme': LM error test
+ * 'rlme': Robust LM error test
+ * 'lml' : LM lag test
+ * 'rlml': Robust LM lag test
+ * 'sarma': LM SARMA test
+ * 'lmwx': LM test for WX
+ * 'rlmwx': Robust LM WX test
+ * 'lmspdurbin': Joint test for SDM
+ * 'rlmdurlag': Robust LM Lag - SDM
+
+ Attributes
+ ----------
+
+ lme : tuple
+ (Only if 'lme' or 'all' was in tests). Pair of statistic and
+ p-value for the LM error test.
+ lml : tuple
+ (Only if 'lml' or 'all' was in tests). Pair of statistic and
+ p-value for the LM lag test.
+ rlme : tuple
+ (Only if 'rlme' or 'all' was in tests). Pair of statistic
+ and p-value for the Robust LM error test.
+ rlml : tuple
+ (Only if 'rlml' or 'all' was in tests). Pair of statistic
+ and p-value for the Robust LM lag test.
+ sarma : tuple
+ (Only if 'sarma' or 'all' was in tests). Pair of statistic
+ and p-value for the SARMA test.
+ lmwx : tuple
+ (Only if 'lmwx' or 'all' was in tests). Pair of statistic
+ and p-value for the LM test for WX.
+ rlmwx : tuple
+ (Only if 'rlmwx' or 'all' was in tests). Pair of statistic
+ and p-value for the Robust LM WX test.
+ rlmdurlag : tuple
+ (Only if 'rlmdurlag' or 'all' was in tests). Pair of statistic
+ and p-value for the Robust LM Lag - SDM test.
+ lmspdurbin : tuple
+ (Only if 'lmspdurbin' or 'all' was in tests). Pair of statistic
+ and p-value for the Joint test for SDM.
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import OLS
+ >>> import spreg
+
+ Open the csv file to access the data for analysis
+
+ >>> csv = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Pull out from the csv the files we need ('HOVAL' as dependent as well as
+ 'INC' and 'CRIME' as independent) and directly transform them into nx1 and
+ nx2 arrays, respectively
+
+ >>> y = np.array([csv.by_col('HOVAL')]).T
+ >>> x = np.array([csv.by_col('INC'), csv.by_col('CRIME')]).T
+
+ Create the weights object from existing .gal file
+
+ >>> w = libpysal.io.open(libpysal.examples.get_path('columbus.gal'), 'r').read()
+
+ Row-standardize the weight object (not required although desirable in some
+ cases)
+
+ >>> w.transform='r'
+
+ Run an OLS regression
+
+ >>> ols = OLS(y, x)
+
+ Run all the LM tests in the residuals. These diagnostics test for the
+ presence of remaining spatial autocorrelation in the residuals of an OLS
+ model and give indication about the type of spatial model. There are five
+ types: presence of a spatial lag model (simple and robust version),
+ presence of a spatial error model (simple and robust version) and joint presence
+ of both a spatial lag as well as a spatial error model.
+
+ >>> lms = spreg.LMtests(ols, w)
+
+ LM error test:
+
+ >>> print(round(lms.lme[0],4), round(lms.lme[1],4))
+ 3.0971 0.0784
+
+ LM lag test:
+
+ >>> print(round(lms.lml[0],4), round(lms.lml[1],4))
+ 0.9816 0.3218
+
+ Robust LM error test:
+
+ >>> print(round(lms.rlme[0],4), round(lms.rlme[1],4))
+ 3.2092 0.0732
+
+ Robust LM lag test:
+
+ >>> print(round(lms.rlml[0],4), round(lms.rlml[1],4))
+ 1.0936 0.2957
+
+ LM SARMA test:
+
+ >>> print(round(lms.sarma[0],4), round(lms.sarma[1],4))
+ 4.1907 0.123
+
+ LM test for WX:
+
+ >>> print(round(lms.lmwx[0],4), round(lms.lmwx[1],4))
+ 1.3377 0.5123
+
+ Robust LM WX test:
+
+ >>> print(round(lms.rlmwx[0],4), round(lms.rlmwx[1],4))
+ 3.4532 0.1779
+
+ Robust LM Lag - SDM:
+
+ >>> print(round(lms.rlmdurlag[0],4), round(lms.rlmdurlag[1],4))
+ 3.0971 0.0784
+
+ Joint test for SDM:
+
+ >>> print(round(lms.lmspdurbin[0],4), round(lms.lmspdurbin[1],4))
+ 4.4348 0.2182
+ """
+
+
+[docs]
+    def __init__(self, ols, w, tests=["all"]):
+        cache = spDcache(ols, w)
+        if tests == ["all"]:
+            tests = ["lme", "lml", "rlme", "rlml", "sarma", "lmwx", "lmspdurbin", "rlmwx",
+                     "rlmdurlag", "lmslxerr"]  # added back in for access
+        if any(test in ["lme", "lmslxerr"] for test in tests):
+            # if "lme" in tests:
+            self.lme = lmErr(ols, w, cache)
+        if any(test in ["lml", "rlmwx"] for test in tests):
+            self.lml = lmLag(ols, w, cache)
+        if "rlme" in tests:
+            self.rlme = rlmErr(ols, w, cache)
+        if "rlml" in tests:
+            self.rlml = rlmLag(ols, w, cache)
+        if "sarma" in tests:
+            self.sarma = lmSarma(ols, w, cache)
+        # if any(test in ["lmwx", "rlmdurlag", "lmslxerr"] for test in tests):
+        if any(test in ["lmwx", "rlmdurlag", "lmslxerr"] for test in tests):
+            self.lmwx = lm_wx(ols, w)
+        if any(test in ["lmspdurbin", "rlmdurlag", "rlmwx"] for test in tests):
+            self.lmspdurbin = lm_spdurbin(ols, w)
+        if "rlmwx" in tests:
+            self.rlmwx = rlm_wx(ols, self.lmspdurbin, self.lml)
+        if "rlmdurlag" in tests:
+            self.rlmdurlag = rlm_durlag(self.lmspdurbin, self.lmwx)
+        if "lmslxerr" in tests:  # currently removed - LA added back in for access
+            self.lmslxerr = lm_slxerr(ols, self.lme, self.lmwx)
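+
+    # A minimal usage sketch (added for illustration, not part of the original
+    # source): instead of the default "all", a subset of tests can be requested
+    # through the `tests` argument handled above. `ols` and `w` are assumed to
+    # be the regression and weights objects built in the class docstring example.
+    #
+    #     lms = LMtests(ols, w, tests=["lme", "lml"])
+    #     lms.lme   # (statistic, p-value) of the LM error test
+    #     lms.lml   # (statistic, p-value) of the LM lag test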
+
+
+
+
+[docs]
+class MoranRes:
+"""
+ Moran's I for spatial autocorrelation in residuals from OLS regression
+
+
+ Parameters
+ ----------
+ ols : OLS
+ OLS regression object
+ w : W
+ Spatial weights instance
+ z : boolean
+ If set to True, computes attributes eI, vI and zI. Defaults to False due to the computational burden of vI.
+
+ Attributes
+ ----------
+ I : float
+ Moran's I statistic
+ eI : float
+ Moran's I expectation
+ vI : float
+ Moran's I variance
+ zI : float
+ Moran's I standardized value
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import OLS
+ >>> import spreg
+
+ Open the DBF file to access the data for analysis
+
+ >>> csv = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Pull out from the data the columns we need ('HOVAL' as dependent as well as
+ 'INC' and 'CRIME' as independent) and directly transform them into nx1 and
+ nx2 arrays, respectively
+
+ >>> y = np.array([csv.by_col('HOVAL')]).T
+ >>> x = np.array([csv.by_col('INC'), csv.by_col('CRIME')]).T
+
+ Create the weights object from existing .gal file
+
+ >>> w = libpysal.io.open(libpysal.examples.get_path('columbus.gal'), 'r').read()
+
+ Row-standardize the weight object (not required although desirable in some
+ cases)
+
+ >>> w.transform='r'
+
+ Run an OLS regression
+
+ >>> ols = OLS(y, x)
+
+ Run Moran's I test for residual spatial autocorrelation in an OLS model.
+ This computes the traditional statistic, applying a correction to the
+ expectation and variance to account for the fact that it is computed on
+ regression residuals rather than on an observed variable
+
+ >>> m = spreg.MoranRes(ols, w, z=True)
+
+ Value of the Moran's I statistic:
+
+ >>> print(round(m.I,4))
+ 0.1713
+
+ Value of the Moran's I expectation:
+
+ >>> print(round(m.eI,4))
+ -0.0345
+
+ Value of the Moran's I variance:
+
+ >>> print(round(m.vI,4))
+ 0.0081
+
+ Value of the Moran's I standardized value. This is
+ distributed as a standard Normal(0, 1)
+
+ >>> print(round(m.zI,4))
+ 2.2827
+
+ P-value of the standardized Moran's I value (z):
+
+ >>> print(round(m.p_norm,4))
+ 0.0224
+ """
+
+
+[docs]
+class AKtest:
+r"""
+ Moran's I test of spatial autocorrelation for IV estimation.
+ Implemented following the original reference :cite:`Anselin1997`
+
+
+ Parameters
+ ----------
+
+ iv : TSLS
+ Regression object from TSLS class
+ w : W
+ Spatial weights instance
+ case : string
+ Flag for special cases (default to 'nosp'):
+
+ * 'nosp': Only non-spatial endogenous regressors (no spatial lag)
+ * 'gen': General case (spatial lag + end. reg.)
+
+ Attributes
+ ----------
+
+ mi : float
+ Moran's I statistic for IV residuals
+ ak : float
+ Square of corrected Moran's I for residuals
+ :math:`ak = \dfrac{N \times I^*}{\phi^2}`.
+ Note: if case='nosp' then it simplifies to the LMerror
+ p : float
+ P-value of the test
+
+ Examples
+ --------
+
+ We first need to import the needed modules. Numpy is needed to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis. The TSLS is required to run the model on
+ which we will perform the tests.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import TSLS, GM_Lag, AKtest
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+
+ Before being able to apply the diagnostics, we have to run a model and,
+ for that, we need the input variables. Extract the CRIME column (crime
+ rates) from the DBF file and make it the dependent variable for the
+ regression. Note that PySAL requires this to be a numpy array of shape
+ (n, 1) as opposed to the also common shape of (n, ) that other packages
+ accept.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) vector from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in, but this can be overridden by passing
+ constant=False.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ In this case, we consider HOVAL (home value) as an endogenous regressor,
+ so we acknowledge that by reading it in a different category.
+
+ >>> yd = []
+ >>> yd.append(db.by_col("HOVAL"))
+ >>> yd = np.array(yd).T
+
+ In order to properly account for the endogeneity, we have to pass in the
+ instruments. Let us consider DISCBD (distance to the CBD) is a good one:
+
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ Now we are good to run the model. It is an easy one line task.
+
+ >>> reg = TSLS(y, X, yd, q=q)
+
+ Now we are concerned with whether our non-spatial model presents spatial
+ autocorrelation in the residuals. To assess this possibility, we can run
+ the Anselin-Kelejian test, which is a version of the classical LM error
+ test adapted for the case of residuals from an instrumental variables (IV)
+ regression. First we need an extra object, the weights matrix, which
+ includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are good to run the test. It is a very simple task:
+
+ >>> ak = AKtest(reg, w)
+
+ And explore the information obtained:
+
+ >>> print('AK test: %f\tP-value: %f'%(ak.ak, ak.p))
+ AK test: 4.642895 P-value: 0.031182
+
+ The test also accommodates the case when the residuals come from an IV
+ regression that includes a spatial lag of the dependent variable. The only
+ requirement needed is to modify the ``case`` parameter when we call
+ ``AKtest``. First, let us run a spatial lag model:
+
+ >>> reg_lag = GM_Lag(y, X, yd, q=q, w=w)
+
+ And now we can run the AK test and obtain similar information as in the
+ non-spatial model.
+
+ >>> ak_sp = AKtest(reg, w, case='gen')
+ >>> print('AK test: %f\tP-value: %f'%(ak_sp.ak, ak_sp.p))
+ AK test: 1.157593 P-value: 0.281965
+
+ """
+
+
+[docs]
+    def __init__(self, iv, w, case="nosp"):
+        if case == "gen":
+            cache = spDcache(iv, w)
+            self.mi, self.ak, self.p = akTest(iv, w, cache)
+        elif case == "nosp":
+            cache = spDcache(iv, w)
+            self.mi = get_mI(iv, w, cache)
+            self.ak, self.p = lmErr(iv, w, cache)
+        else:
+            print(
+                """\n
+                Fix the optional argument 'case' to match the requirements:
+                * 'gen': General case (spatial lag + end. reg.)
+                * 'nosp': No spatial end. reg.
+                \n"""
+            )
+
+
+
+
+class spDcache:
+r"""
+ Helper class to compute reusable pieces in the spatial diagnostics module
+ ...
+
+ Parameters
+ ----------
+
+ reg : OLS_dev, TSLS_dev, STSLS_dev
+ Instance from a regression class
+ w : W
+ Spatial weights instance
+
+ Attributes
+ ----------
+
+ j : array
+ 1x1 array with the result from:
+ :math:`J = \dfrac{(WX\beta)' M (WX\beta) + T \sigma^2}{n \sigma^2}`
+ wu : array
+ nx1 array with spatial lag of the residuals
+ utwuDs : array
+ 1x1 array with the result from:
+ :math:`utwuDs = \dfrac{u' W u}{\tilde{\sigma^2}}`
+ utwyDs : array
+ 1x1 array with the result from:
+ :math:`utwyDs = \dfrac{u' W y}{\tilde{\sigma^2}}`
+ t : array
+ 1x1 array with the result from:
+ :math:`T = tr[(W' + W) W]`
+ trA : float
+ Trace of A as in Cliff & Ord (1981)
+
+ """
+
+    def __init__(self, reg, w):
+        self.reg = reg
+        self.w = w
+        self._cache = {}
+
+    @property
+    def j(self):
+        if "j" not in self._cache:
+            wxb = self.w.sparse * self.reg.predy
+            wxb2 = np.dot(wxb.T, wxb)
+            xwxb = spdot(self.reg.x.T, wxb)
+            num1 = wxb2 - np.dot(xwxb.T, np.dot(self.reg.xtxi, xwxb))
+            num = num1 + (self.t * self.reg.sig2n)
+            den = self.reg.n * self.reg.sig2n
+            self._cache["j"] = num / den
+        return self._cache["j"]
+
+    @property
+    def wu(self):
+        if "wu" not in self._cache:
+            self._cache["wu"] = self.w.sparse * self.reg.u
+        return self._cache["wu"]
+
+    @property
+    def utwuDs(self):
+        if "utwuDs" not in self._cache:
+            res = np.dot(self.reg.u.T, self.wu) / self.reg.sig2n
+            self._cache["utwuDs"] = res
+        return self._cache["utwuDs"]
+
+    @property
+    def utwyDs(self):
+        if "utwyDs" not in self._cache:
+            res = np.dot(self.reg.u.T, self.w.sparse * self.reg.y)
+            self._cache["utwyDs"] = res / self.reg.sig2n
+        return self._cache["utwyDs"]
+
+    @property
+    def t(self):
+        if "t" not in self._cache:
+            prod = (self.w.sparse.T + self.w.sparse) * self.w.sparse
+            self._cache["t"] = np.sum(prod.diagonal())
+        return self._cache["t"]
+
+    @property
+    def trA(self):
+        if "trA" not in self._cache:
+            xtwx = spdot(self.reg.x.T, spdot(self.w.sparse, self.reg.x))
+            mw = np.dot(self.reg.xtxi, xtwx)
+            self._cache["trA"] = np.sum(mw.diagonal())
+        return self._cache["trA"]
+
+    @property
+    def AB(self):
+        """
+        Computes A and B matrices as in Cliff-Ord 1981, p. 203
+        """
+        if "AB" not in self._cache:
+            U = (self.w.sparse + self.w.sparse.T) / 2.0
+            z = spdot(U, self.reg.x, array_out=False)
+            c1 = spdot(self.reg.x.T, z, array_out=False)
+            c2 = spdot(z.T, z, array_out=False)
+            G = self.reg.xtxi
+            A = spdot(G, c1)
+            B = spdot(G, c2)
+            self._cache["AB"] = [A, B]
+        return self._cache["AB"]
+
+
+def lmErr(reg, w, spDcache):
+"""
+ LM error test. Implemented as presented in eq. (9) of Anselin et al.
+ (1996) :cite:`Anselin1996a`.
+
+ Attributes
+ ----------
+ reg : OLS_dev, TSLS_dev, STSLS_dev
+ Instance from a regression class
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Returns
+ -------
+ lme : tuple
+ Pair of statistic and p-value for the LM error test.
+
+ """
+    lm = spDcache.utwuDs ** 2 / spDcache.t
+    pval = chisqprob(lm, 1)
+    return (lm[0][0], pval[0][0])
+
+
+def lmLag(ols, w, spDcache):
+"""
+ LM lag test. Implemented as presented in eq. (13) of Anselin et al.
+ (1996) :cite:`Anselin1996a`.
+
+ Attributes
+ ----------
+ ols : OLS_dev
+ Instance from an OLS_dev regression
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Returns
+ -------
+ lml : tuple
+ Pair of statistic and p-value for the LM lag test.
+
+ """
+    lm = spDcache.utwyDs ** 2 / (ols.n * spDcache.j)
+    pval = chisqprob(lm, 1)
+    return (lm[0][0], pval[0][0])
+
+
+def rlmErr(ols, w, spDcache):
+"""
+ Robust LM error test. Implemented as presented in eq. (8) of Anselin et
+ al. (1996) :cite:`Anselin1996a`.
+
+ NOTE: eq. (8) has an errata, the power -1 in the denominator
+ should be inside the square bracket.
+
+ Attributes
+ ----------
+ ols : OLS_dev
+ Instance from an OLS_dev regression
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Returns
+ -------
+ rlme : tuple
+ Pair of statistic and p-value for the Robust LM error test.
+
+ """
+    nj = ols.n * spDcache.j
+    num = (spDcache.utwuDs - (spDcache.t * spDcache.utwyDs) / nj) ** 2
+    den = spDcache.t * (1.0 - (spDcache.t / nj))
+    lm = num / den
+    pval = chisqprob(lm, 1)
+    return (lm[0][0], pval[0][0])
+
+
+def rlmLag(ols, w, spDcache):
+"""
+ Robust LM lag test. Implemented as presented in eq. (12) of Anselin et al.
+ (1996) :cite:`Anselin1996a`.
+
+ Attributes
+ ----------
+ ols : OLS_dev
+ Instance from an OLS_dev regression
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Returns
+ -------
+ rlml : tuple
+ Pair of statistic and p-value for the Robust LM lag test.
+
+ """
+    lm = (spDcache.utwyDs - spDcache.utwuDs) ** 2 / ((ols.n * spDcache.j) - spDcache.t)
+    pval = chisqprob(lm, 1)
+    return (lm[0][0], pval[0][0])
+
+
+def lmSarma(ols, w, spDcache):
+"""
+ LM SARMA test. Implemented as presented in eq. (15) of Anselin et al.
+ (1996) :cite:`Anselin1996a`.
+
+ Attributes
+ ----------
+ ols : OLS_dev
+ Instance from an OLS_dev regression
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Returns
+ -------
+ sarma : tuple
+ Pair of statistic and p-value for the LM sarma test.
+
+ """
+
+    first = (spDcache.utwyDs - spDcache.utwuDs) ** 2 / (w.n * spDcache.j - spDcache.t)
+    secnd = spDcache.utwuDs ** 2 / spDcache.t
+    lm = first + secnd
+    pval = chisqprob(lm, 2)
+    return (lm[0][0], pval[0][0])
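+
+# A quick consistency check (added for illustration, using the values from the
+# LMtests docstring example above): by construction, the SARMA statistic is the
+# sum of the robust LM lag test and the LM error test; equivalently, it equals
+# the robust LM error test plus the LM lag test.
+#
+#     4.1907 = 1.0936 + 3.0971     # sarma = rlml + lme
+#     4.1907 = 3.2092 + 0.9816     # sarma = rlme + lml (up to rounding)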
+
+def lm_wx(reg, w):
+"""
+ LM test for WX. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`.
+
+ Attributes
+ ----------
+ reg : OLS
+ Instance from an OLS regression
+ w : W
+ Spatial weights instance
+
+ Returns
+ -------
+ lmwx : tuple
+ Pair of statistic and p-value for the LM test for WX.
+
+ """
+
+    # preliminaries
+    # set up X1 (constant) and X (no constant) as x1 and xx
+    x1 = reg.x
+    xx = x1[:, 1:]
+    # WX
+    wx = w.sparse * xx
+    # end of preliminaries
+    # X'W'u
+    xtwtu = wx.T @ reg.u
+    # X'W'X1(X1'X1)^-1 X1'WX
+    mx1 = wx.T @ x1
+    mx = (mx1 @ reg.xtxi) @ mx1.T
+    xwwx = wx.T @ wx
+    xqx = xwwx - mx
+    xqxi = la.inv(xqx)
+    # RSgamma: (X'W'u)'(X'Q1X)^-1(X'W'u) / sig2n
+    xpwpu = wx.T @ reg.u
+    rsg1 = (xpwpu.T @ xqxi) @ xpwpu
+    rsgam = rsg1[0][0] / reg.sig2n
+    pval = chisqprob(rsgam, (reg.k - 1))
+    rsgamma = (rsgam, pval)
+    return rsgamma
+
+def lm_spdurbin(reg, w):
+"""
+ Joint test for SDM. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`.
+
+ Attributes
+ ----------
+ reg : OLS
+ Instance from an OLS regression
+ w : W
+ Spatial weights instance
+
+ Returns
+ -------
+ lmspdurbin : tuple
+ Pair of statistic and p-value for the Joint test for SDM.
+
+ """
+
+    # preliminaries
+    # set up X1 (constant) and X (no constant) as x1 and xx
+    x1 = reg.x
+    xx = x1[:, 1:]
+    k = x1.shape[1]
+    # WX
+    wx = w.sparse * xx
+    # X1b
+    xb = reg.predy
+    # WX1b
+    wxb = w.sparse * xb
+    # Wy
+    wy = w.sparse * reg.y
+    # y'W'e / sig2n
+    drho = (wy.T @ reg.u) / reg.sig2n
+    # X'W'e / sig2n
+    dgam = (wx.T @ reg.u) / reg.sig2n
+    # P = T = tr(W^2 + W'W)
+    pp = w.trcWtW_WW
+    # end of preliminaries
+    # J_11: block matrix with X1'X1 and n/2sig2n
+    jj1a = np.hstack((reg.xtx, np.zeros((k, 1))))
+    jj1b = np.hstack((np.zeros((1, k)), np.array([reg.n / (2.0 * reg.sig2n)]).reshape(1, 1)))
+    jj11 = np.vstack((jj1a, jj1b))
+    # J_12: matrix with k-1 rows X1'WX1b and X1'WX, and 1 row of zeros
+    jj12a = np.hstack((x1.T @ wxb, x1.T @ wx))
+    jj12 = np.vstack((jj12a, np.zeros((1, k))))
+    # J_22 matrix with diagonal elements b'X1'W'WX1b + T.sig2n and X'W'WX
+    # and off-diagonal element b'X1'W'WX
+    jj22a = wxb.T @ wxb + pp * reg.sig2n
+    jj22a = jj22a.reshape(1, 1)
+    wxbtwx = (wxb.T @ wx).reshape(1, k - 1)
+    jj22b = np.hstack((jj22a, wxbtwx))
+    wxtwx = wx.T @ wx
+    jj22c = np.hstack((wxbtwx.T, wxtwx))
+    jj22 = np.vstack((jj22b, jj22c))
+    # J^22 (the inverse) from J^22 = (J_22 - J_21.J_11^-1.J_12)^-1
+    jj11i = la.inv(jj11)
+    j121121 = (jj12.T @ jj11i) @ jj12
+    jj22i1 = jj22 - j121121
+    jj22i = la.inv(jj22i1)
+    # rescale by sig2n
+    jj22i = jj22i * reg.sig2n
+    # statistic
+    dd = np.vstack((drho, dgam))
+    rsjoint = (dd.T @ jj22i) @ dd
+    rsjoint = rsjoint[0][0]
+    pval = chisqprob(rsjoint, k)
+    rsrhogam = (rsjoint, pval)
+    return rsrhogam
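+
+# A quick consistency check (added for illustration, using the values from the
+# LMtests docstring example above): by construction, the joint SDM statistic
+# equals the robust LM lag (SDM) test plus the LM test for WX, and also the
+# robust LM WX test plus the LM lag test.
+#
+#     4.4348 = 3.0971 + 1.3377     # lmspdurbin = rlmdurlag + lmwx
+#     4.4348 = 3.4532 + 0.9816     # lmspdurbin = rlmwx + lml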
+
+def rlm_wx(reg, lmspdurbin, lmlag):
+"""
+ Robust LM WX test. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`.
+
+ Attributes
+ ----------
+ reg : OLS
+ Instance from an OLS regression
+ lmspdurbin : tuple
+ Joint test for SDM as in lm_spdurbin function
+ lmlag : tuple
+ LM Lag test as in lmLag function
+
+ Returns
+ -------
+ rlmwx : tuple
+ Pair of statistic and p-value for the Robust LM WX test.
+
+ """
+    # robust gamma = rsjoint - rsrho
+    rsgams = lmspdurbin[0] - lmlag[0]
+    pval = chisqprob(rsgams, (reg.k - 1))
+    rsgamstar = (rsgams, pval)
+    return rsgamstar
+
+def rlm_durlag(lmspdurbin, lmwx):
+"""
+ Robust LM Lag - SDM. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`.
+
+ Attributes
+ ----------
+ lmspdurbin : tuple
+ Joint test for SDM as in lm_spdurbin function
+ lmwx : tuple
+ LM test for WX as in lm_wx function
+
+ Returns
+ -------
+ rlmdurlag : tuple
+ Pair of statistic and p-value for the Robust LM Lag - SDM test.
+ """
+
+    # robust rho = rsjoint - rsgam
+    rsrhos = lmspdurbin[0] - lmwx[0]
+    pval = chisqprob(rsrhos, 1)
+    rsrhostar = (rsrhos, pval)
+    return rsrhostar
+
+def lm_slxerr(reg, lmerr, lmwx):
+"""
+ Joint test for Error and WX. Implemented as presented in Koley & Bera (2024) :cite:`KoleyBera2024`.
+
+ Attributes
+ ----------
+ reg : OLS
+ Instance from an OLS regression
+ lmerr : tuple
+ LM Error test as in lmErr function
+ lmwx : tuple
+ LM test for WX as in lm_wx function
+
+ Returns
+ -------
+ lmslxerr : tuple
+ Pair of statistic and p-value for the Joint test for Error and WX.
+ """
+    rslamgam = lmerr[0] + lmwx[0]
+    pval = chisqprob(rslamgam, reg.k)
+    rslamgamma = (rslamgam, pval)
+    return rslamgamma
+
+def get_mI(reg, w, spDcache):
+"""
+ Moran's I statistic of spatial autocorrelation as shown in Cliff & Ord
+ (1981) :cite:`clifford1981`, p. 201-203
+
+ Attributes
+ ----------
+ reg : OLS_dev, TSLS_dev, STSLS_dev
+ Instance from a regression class
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Returns
+ -------
+ moran : float
+ Moran's I statistic.
+
+ """
+    mi = (w.n * np.dot(reg.u.T, spDcache.wu)) / (w.s0 * reg.utu)
+    return mi[0][0]
+
+
+def get_vI(ols, w, ei, spDcache):
+"""
+ Moran's I variance coded as in :cite:`clifford1981` (p. 201-203) and R's spdep
+ """
+    A = spDcache.AB[0]
+    trA2 = np.dot(A, A)
+    trA2 = np.sum(trA2.diagonal())
+
+    B = spDcache.AB[1]
+    trB = np.sum(B.diagonal()) * 4.0
+    vi = (w.n ** 2 / (w.s0 ** 2 * (w.n - ols.k) * (w.n - ols.k + 2.0))) * (
+        w.s1 + 2.0 * trA2 - trB - ((2.0 * (spDcache.trA ** 2)) / (w.n - ols.k))
+    )
+    return vi
+
+
+def get_eI(ols, w, spDcache):
+"""
+ Moran's I expectation using matrix M
+ """
+    return -(w.n * spDcache.trA) / (w.s0 * (w.n - ols.k))
+
+
+def get_zI(I, ei, vi):
+"""
+ Standardized I
+
+ Returns two-sided p-values as provided in the GeoDa family
+ """
+    # z = abs((I - ei) / np.sqrt(vi))
+    z = (I - ei) / np.sqrt(vi)
+    # pval = norm.sf(z) * 2.0
+    pval = norm.sf(abs(z)) * 2.0
+    return (z, pval)
+
+
+def akTest(iv, w, spDcache):
+r"""
+ Computes AK-test for the general case (end. reg. + sp. lag)
+
+ Parameters
+ ----------
+
+ iv : STSLS_dev
+ Instance from spatial 2SLS regression
+ w : W
+ Spatial weights instance
+ spDcache : spDcache
+ Instance of spDcache class
+
+ Attributes
+ ----------
+ mi : float
+ Moran's I statistic for IV residuals
+ ak : float
+ Square of corrected Moran's I for residuals:
+ :math:`ak = \dfrac{N \times I^*}{\phi^2}`
+ p : float
+ P-value of the test
+
+ """
+    mi = get_mI(iv, w, spDcache)
+    # Phi2
+    etwz = spdot(iv.u.T, spdot(w.sparse, iv.z))
+    a = np.dot(etwz, np.dot(iv.varb, etwz.T))
+    s12 = (w.s0 / w.n) ** 2
+    phi2 = (spDcache.t + (4.0 / iv.sig2n) * a) / (s12 * w.n)
+    ak = w.n * mi ** 2 / phi2
+    pval = chisqprob(ak, 1)
+    return (mi, ak[0][0], pval[0][0])
+
+
+def comfac_test(rho, beta, gamma, vm):
+"""
+ Computes the Spatial Common Factor Hypothesis test as shown in Anselin (1988, p. 226-229).
+ Note that for the Common Factor Hypothesis test to be valid, gamma has to equal
+ *negative* rho times beta for all beta parameters.
+ That is, when rho is positive, a positive beta means gamma must be negative and vice versa.
+ When rho is negative, beta and gamma must have the same sign.
+ If those signs are not compatible, the test will not be meaningful.
+
+ Parameters
+ ----------
+
+ rho : float
+ Spatial autoregressive coefficient (as in rho*Wy)
+ beta : array
+ Coefficients of the exogenous (not spatially lagged) variables, without the constant (as in X*beta)
+ gamma : array
+ coefficients of the spatially lagged exogenous variables (as in WX*gamma)
+ vm : array
+ Variance-covariance matrix of the coefficients
+ Note: needs to match the order of theta' = [beta', gamma', lambda]
+
+ Returns
+ -------
+ W : float
+ Wald statistic
+ pvalue : float
+ P value for Wald statistic calculated as a Chi sq. distribution
+ with k-1 degrees of freedom
+
+ """
+    g = rho * beta + gamma
+    G = np.vstack((rho * np.eye(beta.shape[0]), np.eye(beta.shape[0]), beta.T))
+
+    GVGi = la.inv(np.dot(G.T, np.dot(vm, G)))
+    W = np.dot(g.T, np.dot(GVGi, g))[0][0]
+    df = G.shape[1]
+    pvalue = chisqprob(W, df)
+    return W, pvalue
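+
+# A minimal numeric sketch of the Wald computation above (added for
+# illustration, with hypothetical values rather than output from an estimated
+# spatial Durbin model). With two non-constant X variables, theta' = [beta',
+# gamma', lambda] has five elements, so vm must be 5x5 and ordered accordingly.
+#
+#     import numpy as np
+#     from numpy import linalg as la
+#     from scipy.stats import chi2
+#
+#     rho = 0.4
+#     beta = np.array([[1.0], [-0.5]])       # coefficients on X (hypothetical)
+#     gamma = np.array([[-0.35], [0.15]])    # coefficients on WX (hypothetical)
+#     vm = np.eye(5) * 0.01                  # hypothetical vcov, order [beta, gamma, lambda]
+#
+#     g = rho * beta + gamma                 # ~0 under the common factor hypothesis
+#     G = np.vstack((rho * np.eye(2), np.eye(2), beta.T))
+#     W = (g.T @ la.inv(G.T @ vm @ G) @ g)[0][0]
+#     pvalue = chi2.sf(W, G.shape[1])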
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()
+
+"""
+Diagnostics for SUR and 3SLS estimation
+"""
+
+__author__="Luc Anselin lanselin@gmail.com, \
+ Pedro V. Amaral pedrovma@gmail.com \
+ Tony Aburaad taburaad@uchicago.edu"
+
+
+importnumpyasnp
+importscipy.statsasstats
+importnumpy.linalgasla
+from.sur_utilsimportsur_dict2mat,sur_mat2dict,sur_corr,spdot
+from.regimesimportbuildR1var,wald_test
+
+
+__all__=["sur_setp","sur_lrtest","sur_lmtest","lam_setp","surLMe","surLMlag"]
+
+
+
+[docs]
+def sur_setp(bigB, varb):
+"""
+ Utility to compute standard error, t and p-value
+
+ Parameters
+ ----------
+ bigB : dictionary
+ of regression coefficient estimates,
+ one vector by equation
+ varb : array
+ variance-covariance matrix of coefficients
+
+ Returns
+ -------
+ surinfdict : dictionary
+ with standard error, t-value, and
+ p-value array, one for each equation
+
+ """
+    vvb = varb.diagonal()
+    n_eq = len(bigB.keys())
+    bigK = np.zeros((n_eq, 1), dtype=np.int_)
+    for r in range(n_eq):
+        bigK[r] = bigB[r].shape[0]
+    b = sur_dict2mat(bigB)
+    se = np.sqrt(vvb)
+    se.resize(len(se), 1)
+    t = np.divide(b, se)
+    tp = stats.norm.sf(abs(t)) * 2
+    surinf = np.hstack((se, t, tp))
+    surinfdict = sur_mat2dict(surinf, bigK)
+    return surinfdict
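+
+# A minimal sketch of the per-coefficient computation above (added for
+# illustration, with hypothetical numbers rather than an estimated SUR model):
+# standard errors come from the diagonal of the coefficient variance-covariance
+# matrix, t-values are b/se, and p-values use the standard normal.
+#
+#     import numpy as np
+#     from scipy import stats
+#
+#     b = np.array([[2.0], [0.5]])       # hypothetical coefficients for one equation
+#     varb = np.diag([0.25, 0.04])       # hypothetical vcov of those coefficients
+#     se = np.sqrt(varb.diagonal()).reshape(-1, 1)
+#     t = b / se
+#     p = stats.norm.sf(abs(t)) * 2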
+
+
+
+
+[docs]
+def lam_setp(lam, vm):
+"""
+ Standard errors, t-test and p-value for lambda in SUR Error ML
+
+ Parameters
+ ----------
+ lam : array
+ n_eq x 1 array with ML estimates for spatial error
+ autoregressive coefficient
+ vm : array
+ n_eq x n_eq subset of variance-covariance matrix for
+ lambda and Sigma in SUR Error ML
+ (needs to be subset from full vm)
+
+ Returns
+ -------
+ : tuple
+ with arrays for standard error, t-value and p-value
+ (each element in the tuple is an n_eq x 1 array)
+
+ """
+    vvb = vm.diagonal()
+    se = np.sqrt(vvb)
+    se.resize(len(se), 1)
+    t = np.divide(lam, se)
+    tp = stats.norm.sf(abs(t)) * 2
+    return (se, t, tp)
+
+
+
+
+[docs]
+def sur_lrtest(n, n_eq, ldetS0, ldetS1):
+"""
+ Likelihood Ratio test on off-diagonal elements of Sigma
+
+ Parameters
+ ----------
+ n : int
+ cross-sectional dimension (number of observations for an equation)
+ n_eq : int
+ number of equations
+ ldetS0 : float
+ log determinant of Sigma for OLS case
+ ldetS1 : float
+ log determinant of Sigma for SUR case (should be iterated)
+
+ Returns
+ -------
+ (lrtest,M,pvalue) : tuple
+ with value of test statistic (lrtest),
+ degrees of freedom (M, as an integer)
+ p-value
+
+ """
+    M = n_eq * (n_eq - 1) / 2.0
+    lrtest = n * (ldetS0 - ldetS1)
+    pvalue = stats.chi2.sf(lrtest, M)
+    return (lrtest, int(M), pvalue)
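+
+# A minimal numeric sketch of the statistic above (added for illustration,
+# with hypothetical log-determinants; the OLS value ldetS0 is never smaller
+# than the iterated SUR value ldetS1, so the statistic is non-negative):
+#
+#     from scipy import stats
+#
+#     n, n_eq = 49, 2
+#     ldetS0, ldetS1 = 3.10, 3.02        # hypothetical values
+#     M = n_eq * (n_eq - 1) / 2.0        # 1 degree of freedom for two equations
+#     lrtest = n * (ldetS0 - ldetS1)     # 3.92
+#     pvalue = stats.chi2.sf(lrtest, M)  # about 0.048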
+
+
+
+
+[docs]
+def sur_lmtest(n, n_eq, sig):
+"""
+ Lagrange Multiplier test on off-diagonal elements of Sigma
+
+ Parameters
+ ----------
+ n : int
+ cross-sectional dimension (number of observations for an equation)
+ n_eq : int
+ number of equations
+ sig : array
+ inter-equation covariance matrix for null model (OLS)
+
+ Returns
+ -------
+ (lmtest,M,pvalue) : tuple
+ with value of test statistic (lmtest),
+ degrees of freedom (M, as an integer)
+ p-value
+ """
+    R = sur_corr(sig)
+    tr = np.trace(np.dot(R.T, R))
+    M = n_eq * (n_eq - 1) / 2.0
+    lmtest = (n / 2.0) * (tr - n_eq)
+    pvalue = stats.chi2.sf(lmtest, M)
+    return (lmtest, int(M), pvalue)
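+
+# A minimal numeric sketch of the statistic above (added for illustration,
+# with a hypothetical cross-equation covariance matrix; here sur_corr is taken
+# to convert that covariance matrix into a correlation matrix):
+#
+#     import numpy as np
+#     from scipy import stats
+#
+#     n, n_eq = 49, 2
+#     sig = np.array([[2.0, 0.6], [0.6, 1.5]])   # hypothetical OLS covariance
+#     d = np.sqrt(np.diag(sig))
+#     R = sig / np.outer(d, d)                   # correlation matrix
+#     tr = np.trace(R.T @ R)
+#     M = n_eq * (n_eq - 1) / 2.0
+#     lmtest = (n / 2.0) * (tr - n_eq)
+#     pvalue = stats.chi2.sf(lmtest, M)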
+
+
+
+
+[docs]
+def surLMe(n_eq, WS, bigE, sig):
+"""
+ Lagrange Multiplier test on error spatial autocorrelation in SUR
+
+ Parameters
+ ----------
+ n_eq : int
+ number of equations
+ WS : array
+ spatial weights matrix in sparse form
+ bigE : array
+ n x n_eq matrix of residuals by equation
+ sig : array
+ cross-equation error covariance matrix
+
+ Returns
+ -------
+ (LMe,n_eq,pvalue) : tuple
+ with value of statistic (LMe), degrees
+ of freedom (n_eq) and p-value
+
+ """
+    # spatially lagged residuals
+    WbigE = WS * bigE
+    # score
+    EWE = np.dot(bigE.T, WbigE)
+    sigi = la.inv(sig)
+    SEWE = sigi * EWE
+    # score = SEWE.sum(axis=1)
+    # score.resize(n_eq,1)
+    # note score is column sum of Sig_i * E'WE, a 1 by n_eq row vector
+    # previously stored as column
+    score = SEWE.sum(axis=0)
+    score.resize(1, n_eq)
+
+    # trace terms
+    WW = WS * WS
+    trWW = np.sum(WW.diagonal())
+    WTW = WS.T * WS
+    trWtW = np.sum(WTW.diagonal())
+    # denominator
+    SiS = sigi * sig
+    Tii = trWW * np.identity(n_eq)
+    tSiS = trWtW * SiS
+    denom = Tii + tSiS
+    idenom = la.inv(denom)
+    # test statistic
+    # LMe = np.dot(np.dot(score.T,idenom),score)[0][0]
+    # score is now row vector
+    LMe = np.dot(np.dot(score, idenom), score.T)[0][0]
+    pvalue = stats.chi2.sf(LMe, n_eq)
+    return (LMe, n_eq, pvalue)
+
+
+
+
+[docs]
+def surLMlag(n_eq, WS, bigy, bigX, bigE, bigYP, sig, varb):
+"""
+ Lagrange Multiplier test on lag spatial autocorrelation in SUR
+
+ Parameters
+ ----------
+ n_eq : int
+ number of equations
+ WS : array
+ spatial weights matrix in sparse form
+ bigy : dictionary
+ with y values
+ bigX : dictionary
+ with X values
+ bigE : array
+ n x n_eq matrix of residuals by equation
+ bigYP : array
+ n x n_eq matrix of predicted values by equation
+ sig : array
+ cross-equation error covariance matrix
+ varb : array
+ variance-covariance matrix for b coefficients (inverse of Ibb)
+
+ Returns
+ -------
+ (LMlag,n_eq,pvalue) : tuple
+ with value of statistic (LMlag), degrees
+ of freedom (n_eq) and p-value
+
+ """
+    # Score
+    Y = np.hstack([bigy[r] for r in range(n_eq)])
+    WY = WS * Y
+    EWY = np.dot(bigE.T, WY)
+    sigi = la.inv(sig)
+    SEWE = sigi * EWY
+    score = SEWE.sum(axis=0)  # column sums
+    score.resize(1, n_eq)  # score as a row vector
+
+    # I(rho,rho) as partitioned inverse, eq 72
+    # trace terms
+    WW = WS * WS
+    trWW = np.sum(WW.diagonal())  # T1
+    WTW = WS.T * WS
+    trWtW = np.sum(WTW.diagonal())  # T2
+
+    # I(rho,rho)
+    SiS = sigi * sig
+    Tii = trWW * np.identity(n_eq)  # T1It
+    tSiS = trWtW * SiS
+    firstHalf = Tii + tSiS
+    WbigYP = WS * bigYP
+    inner = np.dot(WbigYP.T, WbigYP)
+    secondHalf = sigi * inner
+    Ipp = firstHalf + secondHalf  # eq. 75
+
+    # I(b,b) inverse is varb
+
+    # I(b,rho)
+    bp = sigi[0,] * spdot(bigX[0].T, WbigYP)  # initialize
+    for r in range(1, n_eq):
+        bpwork = sigi[r,] * spdot(bigX[r].T, WbigYP)
+        bp = np.vstack((bp, bpwork))
+    # partitioned part
+    i_inner = Ipp - np.dot(np.dot(bp.T, varb), bp)
+    # partitioned inverse of information matrix
+    Ippi = la.inv(i_inner)
+
+    # test statistic
+    LMlag = np.dot(np.dot(score, Ippi), score.T)[0][0]
+    # p-value
+    pvalue = stats.chi2.sf(LMlag, n_eq)
+    return (LMlag, n_eq, pvalue)
+
+
+
+def sur_chow(n_eq, bigK, bSUR, varb):
+"""
+ test on constancy of regression coefficients across equations in
+ a SUR specification
+
+ Note: requires a previous check on constancy of number of coefficients
+ across equations; no other checks are carried out, so it is possible
+ that the results are meaningless if the variables are not listed in
+ the same order in each equation.
+
+ Parameters
+ ----------
+ n_eq : int
+ number of equations
+ bigK : array
+ with the number of variables by equation (includes constant)
+ bSUR : dictionary
+ with the SUR regression coefficients by equation
+ varb : array
+ the variance-covariance matrix for the SUR regression
+ coefficients
+
+ Returns
+ -------
+ test : list
+ a list with, for each coefficient (in order), a tuple with the
+ value of the test statistic, the degrees of freedom, and the
+ p-value
+
+ """
+    kr = bigK[0][0]
+    test = []
+    bb = sur_dict2mat(bSUR)
+    kf = 0
+    nr = n_eq
+    df = n_eq - 1
+    for i in range(kr):
+        Ri = buildR1var(i, kr, kf, 0, nr)
+        tt, p = wald_test(bb, Ri, np.zeros((df, 1)), varb)
+        test.append((tt, df, p))
+    return test
+
+
+def sur_joinrho(n_eq, bigK, bSUR, varb):
+"""
+ Test on joint significance of spatial autoregressive coefficient in SUR
+
+ Parameters
+ ----------
+ n_eq : int
+ number of equations
+ bigK : array
+ n_eq x 1 array with number of variables by equation
+ (includes constant term, exogenous and endogenous variables,
+ and the spatial lag)
+ bSUR : dictionary
+ with regression coefficients by equation, with
+ the spatial autoregressive term as last
+ varb : array
+ variance-covariance matrix for regression coefficients
+
+ Returns
+ -------
+ : tuple
+ with test statistic, degrees of freedom, p-value
+
+ """
+    bb = sur_dict2mat(bSUR)
+    R = np.zeros((n_eq, varb.shape[0]))
+    q = np.zeros((n_eq, 1))
+    kc = -1
+    for i in range(n_eq):
+        kc = kc + bigK[i]
+        R[i, kc] = 1
+    w, p = wald_test(bb, R, q, varb)
+    return (w, n_eq, p)
+
+"""
+Diagnostics for two stage least squares regression estimations.
+
+"""
+
+__author__ = (
+    "Luc Anselin luc.anselin@asu.edu, Nicholas Malizia nicholas.malizia@asu.edu "
+)
+
+import numpy as np
+from scipy import stats
+from scipy.stats import pearsonr
+
+__all__ = ["t_stat", "pr2_aspatial", "pr2_spatial"]
+
+
+
+[docs]
+def t_stat(reg, z_stat=False):
+"""
+ Calculates the t-statistics (or z-statistics) and associated p-values.
+ :cite:`Greene2003`
+
+ Parameters
+ ----------
+ reg : regression object
+ output instance from a regression model
+ z_stat : boolean
+ If True run z-stat instead of t-stat
+
+ Returns
+ -------
+ ts_result : list of tuples
+ each tuple includes value of t statistic (or z
+ statistic) and associated p-value
+
+ Examples
+ --------
+
+ We first need to import the needed modules. Numpy is needed to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis. The ``diagnostics`` module is used for the tests
+ we will show here and the OLS and TSLS are required to run the models on
+ which we will perform the tests.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg
+ >>> from spreg import OLS
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),'r')
+
+ Before being able to apply the diagnostics, we have to run a model and,
+ for that, we need the input variables. Extract the CRIME column (crime
+ rates) from the DBF file and make it the dependent variable for the
+ regression. Note that PySAL requires this to be a numpy array of shape
+ (n, 1) as opposed to the also common shape of (n, ) that other packages
+ accept.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) and HOVAL (home value) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in, but this can be overridden by passing
+ constant=False.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("HOVAL"))
+ >>> X = np.array(X).T
+
+ Run an OLS regression. Since it is a non-spatial model, all we need is the
+ dependent and the independent variable.
+
+ >>> reg = OLS(y,X)
+
+ Now we can perform a t-statistic on the model:
+
+ >>> testresult = spreg.t_stat(reg)
+ >>> print("%12.12f"%testresult[0][0], "%12.12f"%testresult[0][1], "%12.12f"%testresult[1][0], "%12.12f"%testresult[1][1], "%12.12f"%testresult[2][0], "%12.12f"%testresult[2][1])
+ 14.490373143689 0.000000000000 -4.780496191297 0.000018289595 -2.654408642718 0.010874504910
+
+ We can also use the z-stat. For that, we re-build the model so we consider
+ HOVAL as endogenous, instrument for it using DISCBD and carry out two
+ stage least squares (TSLS) estimation.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> yd = []
+ >>> yd.append(db.by_col("HOVAL"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ Once the variables are read as different objects, we are good to run the
+ model.
+
+ >>> reg = spreg.TSLS(y, X, yd, q)
+
+ With the output of the TSLS regression, we can perform a z-statistic:
+
+ >>> testresult = spreg.t_stat(reg, z_stat=True)
+ >>> print("%12.10f"%testresult[0][0], "%12.10f"%testresult[0][1], "%12.10f"%testresult[1][0], "%12.10f"%testresult[1][1], "%12.10f"%testresult[2][0], "%12.10f"%testresult[2][1])
+ 5.8452644705 0.0000000051 0.3676015668 0.7131703463 -1.9946891308 0.0460767956
+ """
+
+    k = reg.k  # (scalar) number of ind. vars (includes constant)
+    n = reg.n  # (scalar) number of observations
+    vm = reg.vm  # (array) coefficients of variance matrix (k x k)
+    betas = reg.betas  # (array) coefficients of the regressors (1 x k)
+    variance = vm.diagonal()
+    tStat = betas.reshape(
+        len(betas),
+    ) / np.sqrt(variance)
+    ts_result = []
+    for t in tStat:
+        if z_stat:
+            ts_result.append((t, stats.norm.sf(abs(t)) * 2))
+        else:
+            ts_result.append((t, stats.t.sf(abs(t), n - k) * 2))
+    return ts_result
+
+
+
+def pr2_aspatial(tslsreg):
+"""
+ Calculates the pseudo r^2 for the two stage least squares regression.
+
+ Parameters
+ ----------
+ tslsreg : two stage least squares regression object
+ output instance from a two stage least squares
+ regression model
+
+ Returns
+ -------
+ pr2_result : float
+ value of the squared pearson correlation between
+ the y and tsls-predicted y vectors
+
+ Examples
+ --------
+
+ We first need to import the needed modules. Numpy is needed to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis. The TSLS is required to run the model on
+ which we will perform the tests.
+
+ >>> import numpy as np
+ >>> from spreg import TSLS, pr2_aspatial
+ >>> import libpysal
+ >>> from libpysal import examples
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),'r')
+
+ Before being able to apply the diagnostics, we have to run a model and,
+ for that, we need the input variables. Extract the CRIME column (crime
+ rates) from the DBF file and make it the dependent variable for the
+ regression. Note that PySAL requires this to be a numpy array of shape
+ (n, 1) as opposed to the also common shape of (n, ) that other packages
+ accept.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) vector from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in, but this can be overridden by passing
+ constant=False.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ In this case, we consider HOVAL (home value) as an endogenous regressor,
+ so we acknowledge that by reading it in a different category.
+
+ >>> yd = []
+ >>> yd.append(db.by_col("HOVAL"))
+ >>> yd = np.array(yd).T
+
+ In order to properly account for the endogeneity, we have to pass in the
+ instruments. Let us consider DISCBD (distance to the CBD) is a good one:
+
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ Now we are good to run the model. It is an easy one line task.
+
+ >>> reg = TSLS(y, X, yd, q=q)
+
+ In order to perform the pseudo R^2, we pass the regression object to the
+ function and we are done!
+
+ >>> result = pr2_aspatial(reg)
+ >>> print("%1.6f"%result)
+ 0.279361
+
+ """
+
+    y = tslsreg.y
+    predy = tslsreg.predy
+    pr = pearsonr(y.flatten(), predy.flatten())[0]
+    pr2_result = float(pr ** 2)
+    return pr2_result
+
+
+def pr2_spatial(tslsreg):
+"""
+ Calculates the pseudo r^2 for the spatial two stage least squares
+ regression.
+
+ Parameters
+ ----------
+ tslsreg : spatial two stage least squares regression object
+ output instance from a spatial two stage least
+ squares regression model
+
+ Returns
+ -------
+ pr2_result : float
+ value of the squared pearson correlation between
+ the y and stsls-predicted y vectors
+
+ Examples
+ --------
+
+ We first need to import the needed modules. Numpy is needed to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis. The GM_Lag is required to run the model on
+ which we will perform the tests and the ``spreg.diagnostics`` module
+ contains the function with the test.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> import spreg as D
+ >>> from spreg import GM_Lag
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(examples.get_path("columbus.dbf"),'r')
+
+ Extract the HOVAL column (home value) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in, but this can be overridden by passing
+ constant=False.
+
+ >>> X = np.array(db.by_col("INC"))
+ >>> X = np.reshape(X, (49,1))
+
+ In this case, we consider CRIME (crime rates) as an endogenous regressor,
+ so we acknowledge that by reading it in a different category.
+
+ >>> yd = np.array(db.by_col("CRIME"))
+ >>> yd = np.reshape(yd, (49,1))
+
+ In order to properly account for the endogeneity, we have to pass in the
+ instruments. Let us consider DISCBD (distance to the CBD) is a good one:
+
+ >>> q = np.array(db.by_col("DISCBD"))
+ >>> q = np.reshape(q, (49,1))
+
+ Since this test has a spatial component, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ Now we are good to run the spatial lag model. Make sure you pass all the
+ parameters correctly and, if desired, pass the names of the variables as
+ well so when you print the summary (reg.summary) they are included:
+
+ >>> reg = GM_Lag(y, X, w=w, yend=yd, q=q, w_lags=2, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')
+
+ Once we have a regression object, we can perform the spatial version of
+ the pseudo R^2. It is as simple as one line!
+
+ >>> result = pr2_spatial(reg)
+ >>> print("%1.6f"%result)
+ 0.299649
+
+ """
+
+    y = tslsreg.y
+    predy_e = tslsreg.predy_e
+    pr = pearsonr(y.flatten(), predy_e.flatten())[0]
+    pr2_result = float(pr ** 2)
+    return pr2_result
+
+
+def _test():
+    import doctest
+
+    doctest.testmod()
+
+
+if __name__ == "__main__":
+    _test()
+
+"""
+Spatial Error Models module
+"""
+
+__author__="Luc Anselin lanselin@gmail.com, \
+ Daniel Arribas-Bel darribas@asu.edu, \
+ Pedro V. Amaral pedro.amaral@asu.edu"
+
+importnumpyasnp
+fromnumpyimportlinalgasla
+from.importolsasOLS
+from.utilsimport(
+ set_endog,
+ sp_att,
+ optim_moments,
+ get_spFilter,
+ get_lags,
+ spdot,
+ RegressionPropsY,
+ set_warn,
+)
+from.importtwoslsasTSLS
+from.importuser_outputasUSER
+importpandasaspd
+from.outputimportoutput,_spat_pseudo_r2
+from.error_sp_hetimportGM_Error_Het,GM_Endog_Error_Het,GM_Combo_Het
+from.error_sp_homimportGM_Error_Hom,GM_Endog_Error_Hom,GM_Combo_Hom
+fromitertoolsimportcompress
+
+
+__all__=["GMM_Error","GM_Error","GM_Endog_Error","GM_Combo"]
+
+
+class BaseGM_Error(RegressionPropsY):
+"""
+ GMM method for a spatial error model (note: no consistency checks
+ diagnostics or constant added); based on Kelejian and Prucha
+ (1998, 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+
+ Examples
+ --------
+
+ >>> import libpysal
+ >>> import numpy as np
+ >>> import spreg
+ >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array([dbf.by_col('HOVAL')]).T
+ >>> x = np.array([dbf.by_col('INC'), dbf.by_col('CRIME')]).T
+ >>> x = np.hstack((np.ones(y.shape),x))
+ >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
+ >>> w.transform='r'
+ >>> model = spreg.error_sp.BaseGM_Error(y, x, w=w.sparse)
+ >>> np.around(model.betas, decimals=4)
+ array([[47.6946],
+ [ 0.7105],
+ [-0.5505],
+ [ 0.3257]])
+ """
+
+    def __init__(self, y, x, w, hard_bound=False):
+        # 1a. OLS --> \tilde{betas}
+        ols = OLS.BaseOLS(y=y, x=x)
+        self.n, self.k = ols.x.shape
+        self.x = ols.x
+        self.y = ols.y
+
+        # 1b. GMM --> \tilde{\lambda1}
+        moments = _momentsGM_Error(w, ols.u)
+        lambda1 = optim_moments(moments, hard_bound=hard_bound)
+
+        # 2a. OLS --> \hat{betas}
+        xs = get_spFilter(w, lambda1, self.x)
+        ys = get_spFilter(w, lambda1, self.y)
+        ols2 = OLS.BaseOLS(y=ys, x=xs)
+
+        # Output
+        self.predy = spdot(self.x, ols2.betas)
+        self.u = y - self.predy
+        self.betas = np.vstack((ols2.betas, np.array([[lambda1]])))
+        self.sig2 = ols2.sig2n
+        self.e_filtered = self.u - lambda1 * w * self.u
+
+        self.vm = self.sig2 * ols2.xtxi
+        se_betas = np.sqrt(self.vm.diagonal())
+        self._cache = {}
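+
+        # A minimal sketch of step 2a above (added for illustration), assuming
+        # get_spFilter applies the spatial Cochrane-Orcutt-style transform
+        # (I - lambda*W) to its argument, with `w` the sparse weights matrix:
+        #
+        #     xs = x - lambda1 * (w * x)    # spatially filtered X
+        #     ys = y - lambda1 * (w * y)    # spatially filtered y
+        #     # OLS on (ys, xs) then yields the GM estimates of beta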
+
+
+
+[docs]
+class GM_Error(BaseGM_Error):
+"""
+ GMM method for a spatial error model, with results and diagnostics; based
+ on Kelejian and Prucha (1998, 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object (always needed)
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import libpysal
+ >>> import numpy as np
+ >>> from spreg import GM_Error
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array([dbf.by_col('HOVAL')]).T
+
+ Extract CRIME (crime) and INC (income) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> names_to_extract = ['INC', 'CRIME']
+ >>> x = np.array([dbf.by_col(name) for name in names_to_extract]).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will use
+ ``columbus.gal``, which contains contiguity relationships between the
+ observations in the Columbus dataset we are using throughout this example.
+ Note that, in order to read the file, not only to open it, we need to
+ append '.read()' at the end of the command.
+
+ >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform='r'
+
+ We are all set with the preliminaries; we are good to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> model = GM_Error(y, x, w=w, name_y='hoval', name_x=['income', 'crime'], name_ds='columbus')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. Note that because we are running the classical GMM error
+ model from 1998/99, the spatial parameter is obtained as a point estimate, so
+ although you get a value for it (there are four coefficients under
+ model.betas), you cannot perform inference on it (there are only three
+ values in model.se_betas).
+
+ >>> print(model.name_x)
+ ['CONSTANT', 'income', 'crime', 'lambda']
+ >>> np.around(model.betas, decimals=4)
+ array([[47.6946],
+ [ 0.7105],
+ [-0.5505],
+ [ 0.3257]])
+ >>> np.around(model.std_err, decimals=4)
+ array([12.412 , 0.5044, 0.1785])
+ >>> np.around(model.z_stat, decimals=6) #doctest: +SKIP
+ array([[ 3.84261100e+00, 1.22000000e-04],
+ [ 1.40839200e+00, 1.59015000e-01],
+ [ -3.08424700e+00, 2.04100000e-03]])
+ >>> round(model.sig2,4)
+ 198.5596
+
+ """
+
+
+
+
+
+class BaseGM_Endog_Error(RegressionPropsY):
+"""
+ GMM method for a spatial error model with endogenous variables (note: no
+ consistency checks, diagnostics or constant added); based on Kelejian and
+ Prucha (1998, 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ z : array
+ nxk array of variables (combination of x and yend)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+
+ Examples
+ --------
+
+ >>> import libpysal
+ >>> import numpy as np
+ >>> import spreg
+ >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array([dbf.by_col('CRIME')]).T
+ >>> x = np.array([dbf.by_col('INC')]).T
+ >>> x = np.hstack((np.ones(y.shape),x))
+ >>> yend = np.array([dbf.by_col('HOVAL')]).T
+ >>> q = np.array([dbf.by_col('DISCBD')]).T
+ >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
+ >>> w.transform='r'
+ >>> model = spreg.error_sp.BaseGM_Endog_Error(y, x, yend, q, w=w.sparse)
+ >>> np.around(model.betas, decimals=4)
+ array([[82.5723],
+ [ 0.581 ],
+ [-1.4481],
+ [ 0.3499]])
+
+ """
+
+    def __init__(self, y, x, yend, q, w, hard_bound=False):
+        # 1a. TSLS --> \tilde{betas}
+        tsls = TSLS.BaseTSLS(y=y, x=x, yend=yend, q=q)
+        self.n, self.k = tsls.z.shape
+        self.x = tsls.x
+        self.y = tsls.y
+        self.yend, self.z = tsls.yend, tsls.z
+
+        # 1b. GMM --> \tilde{\lambda1}
+        moments = _momentsGM_Error(w, tsls.u)
+        lambda1 = optim_moments(moments, hard_bound=hard_bound)
+
+        # 2a. 2SLS --> \hat{betas}
+        xs = get_spFilter(w, lambda1, self.x)
+        ys = get_spFilter(w, lambda1, self.y)
+        yend_s = get_spFilter(w, lambda1, self.yend)
+        tsls2 = TSLS.BaseTSLS(ys, xs, yend_s, h=tsls.h)
+
+        # Output
+        self.betas = np.vstack((tsls2.betas, np.array([[lambda1]])))
+        self.predy = spdot(tsls.z, tsls2.betas)
+        self.u = y - self.predy
+        self.sig2 = float(np.dot(tsls2.u.T, tsls2.u)) / self.n
+        self.e_filtered = self.u - lambda1 * w * self.u
+        self.vm = self.sig2 * tsls2.varb
+        self._cache = {}
+
+
+
+[docs]
+class GM_Endog_Error(BaseGM_Endog_Error):
+"""
+ GMM method for a spatial error model with endogenous variables, with
+ results and diagnostics; based on Kelejian and Prucha (1998,
+ 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object (always needed)
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ z : array
+ nxk array of variables (combination of x and yend)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import libpysal
+ >>> import numpy as np
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> dbf = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+
+ Extract the CRIME column (crime rates) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array([dbf.by_col('CRIME')]).T
+
+ Extract INC (income) vector from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in.
+
+ >>> x = np.array([dbf.by_col('INC')]).T
+
+ In this case we consider HOVAL (home value) to be an endogenous regressor.
+ We tell the model that this is so by passing it in a different parameter
+ from the exogenous variables (x).
+
+ >>> yend = np.array([dbf.by_col('HOVAL')]).T
+
+ Because we have endogenous variables, to obtain a correct estimate of the
+ model, we need to instrument for HOVAL. We use DISCBD (distance to the
+ CBD) for this and hence put it in the instruments parameter, 'q'.
+
+ >>> q = np.array([dbf.by_col('DISCBD')]).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will use
+ ``columbus.gal``, which contains contiguity relationships between the
+ observations in the Columbus dataset we are using throughout this example.
+ Note that, in order to actually read the file, not just open it, we need to
+ append '.read()' at the end of the command.
+
+ >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform='r'
+
+ We are all set with the preliminaries, so we are good to run the model. In this
+ case, we will need the variables (exogenous and endogenous), the
+ instruments and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Endog_Error
+ >>> model = GM_Endog_Error(y, x, yend, q, w=w, name_x=['inc'], name_y='crime', name_yend=['hoval'], name_q=['discbd'], name_ds='columbus')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. Note that because we are running the classical GMM error
+ model from 1998/99, the spatial parameter is obtained as a point estimate, so
+ although you get a value for it (there are four coefficients under
+ model.betas), you cannot perform inference on it (there are only three
+ values in model.std_err). Also, this regression uses a two stage least
+ squares estimation method that accounts for the endogeneity created by the
+ endogenous variables included.
+
+ >>> print(model.name_z)
+ ['CONSTANT', 'inc', 'hoval', 'lambda']
+ >>> np.around(model.betas, decimals=4)
+ array([[82.5723],
+ [ 0.581 ],
+ [-1.4481],
+ [ 0.3499]])
+ >>> np.around(model.std_err, decimals=4)
+ array([16.1382, 1.3545, 0.7862])
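+
+ As noted in the Parameters section, passing slx_lags greater than zero adds
+ spatially lagged X variables and turns the specification into an SLX-Error
+ model. A minimal, unverified sketch of that call (output omitted):
+
+ >>> model_slx = GM_Endog_Error(y, x, yend, q, w=w, slx_lags=1, name_x=['inc'], name_y='crime', name_yend=['hoval'], name_q=['discbd'], name_ds='columbus')  # doctest: +SKIP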
+
+ """
+
+
+[docs]
+class GM_Combo(BaseGM_Endog_Error):
+"""
+ GMM method for a spatial lag and error model with endogenous variables,
+ with results and diagnostics; based on Kelejian and Prucha (1998,
+ 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object (always needed)
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the General Nesting
+ Spatial Model (GNSM) type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q)
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ z : array
+ nxk array of variables (combination of x and yend)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ sig2 : float
+ Sigma squared used in computations (based on filtered
+ residuals)
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import GM_Combo
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+
+ Extract the CRIME column (crime rates) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) vector from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ The Combo class runs an SARAR model, that is a spatial lag+error model.
+ In this case we will run a simple version of that, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Combo(y, X, w=w, name_y='crime', name_x=['income'], name_ds='columbus')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. Note that because we are running the classical GMM error
+ model from 1998/99, the spatial parameter is obtained as a point estimate, so
+ although you get a value for it (there are four coefficients under
+ reg.betas), you cannot perform inference on it (there are only three
+ values in reg.std_err). Also, this regression uses a two stage least
+ squares estimation method that accounts for the endogeneity created by the
+ spatial lag of the dependent variable. We can check the betas:
+
+ >>> print(reg.name_z)
+ ['CONSTANT', 'income', 'W_crime', 'lambda']
+ >>> print(np.around(np.hstack((reg.betas[:-1],np.sqrt(reg.vm.diagonal()).reshape(3,1))),3))
+ [[39.059 11.86 ]
+ [-1.404 0.391]
+ [ 0.467 0.2 ]]
+
+ And lambda:
+
+ >>> print('lambda: ', np.around(reg.betas[-1], 3))
+ lambda: [-0.048]
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. As an example, we will include HOVAL (home value) as
+ endogenous and will instrument with DISCBD (distance to the CBD). We first
+ need to read in the variables:
+
+ >>> yd = []
+ >>> yd.append(db.by_col("HOVAL"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> reg = GM_Combo(y, X, yd, q, w=w, name_x=['inc'], name_y='crime', name_yend=['hoval'], name_q=['discbd'], name_ds='columbus')
+ >>> print(reg.name_z)
+ ['CONSTANT', 'inc', 'hoval', 'W_crime', 'lambda']
+ >>> names = np.array(reg.name_z).reshape(5,1)
+ >>> print(np.hstack((names[0:4,:], np.around(np.hstack((reg.betas[:-1], np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))))
+ [['CONSTANT' '50.0944' '14.3593']
+ ['inc' '-0.2552' '0.5667']
+ ['hoval' '-0.6885' '0.3029']
+ ['W_crime' '0.4375' '0.2314']]
+
+ >>> print('lambda: ', np.around(reg.betas[-1], 3))
+ lambda: [0.254]
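+
+ Finally, as described in the Parameters section, setting slx_lags greater than
+ zero adds spatially lagged X variables, yielding a General Nesting Spatial
+ Model. A minimal, unverified sketch (output omitted):
+
+ >>> reg_gns = GM_Combo(y, X, yd, q, w=w, slx_lags=1, name_x=['inc'], name_y='crime', name_yend=['hoval'], name_q=['discbd'], name_ds='columbus')  # doctest: +SKIP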
+
+ """
+
+
+[docs]
+class GMM_Error(
+ GM_Error,
+ GM_Endog_Error,
+ GM_Combo,
+ GM_Error_Het,
+ GM_Endog_Error_Het,
+ GM_Combo_Het,
+ GM_Error_Hom,
+ GM_Endog_Error_Hom,
+ GM_Combo_Hom,
+):
+"""
+ Wrapper function to call any of the GMM methods for a spatial error model available in spreg
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object (always needed)
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable (if any)
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (if any)
+ (note: this should not contain any variables from x)
+ estimator : string
+ Choice of estimator to be used. Options are: 'het', which
+ is robust to heteroskedasticity, 'hom', which assumes
+ homoskedasticity, and 'kp98', which does not provide
+ inference on the spatial parameter for the error term.
+ add_wy : boolean
+ If True, then a spatial lag of the dependent variable is included.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error or GNSM type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ spat_diag : boolean, ignored, included for compatibility with other models
+ **kwargs : keywords
+ Additional arguments to pass on to the estimators.
+ See the specific functions for details on what can be used.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ name_yend : list of strings (optional)
+ Names of endogenous variables for use in output
+ name_z : list of strings (optional)
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings (optional)
+ Names of external instruments
+ name_h : list of strings (optional)
+ Names of all instruments used in output
+
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``libpysal`` to
+ handle the weights and file management.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ Since we want to run a spatial error model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ The GMM_Error class can run error models and SARAR models, that is a spatial lag+error model.
+ In this example we will run a simple version of the latter, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GMM_Error
+ >>> model = GMM_Error(y, x, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_ds='NAT')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them.
+
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 CONSTANT 2.176007 1.115807 1.950165 0.051156
+ 1 PS90 1.108054 0.207964 5.328096 0.0
+ 2 UE90 0.664362 0.061294 10.83893 0.0
+ 3 W_HR90 -0.066539 0.154395 -0.430964 0.666494
+ 4 lambda 0.765087 0.04268 17.926245 0.0
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. In this case we consider RD90 (resource deprivation)
+ as an endogenous regressor. We use FP89 (families below poverty)
+ for this and hence put it in the instruments parameter, 'q'.
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> model = GMM_Error(y, x, yend=yd, q=q, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_ds='NAT')
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 CONSTANT 5.44035 0.560476 9.706652 0.0
+ 1 PS90 1.427042 0.1821 7.836572 0.0
+ 2 UE90 -0.075224 0.050031 -1.503544 0.132699
+ 3 RD90 3.316266 0.261269 12.692924 0.0
+ 4 W_HR90 0.200314 0.057433 3.487777 0.000487
+ 5 lambda 0.136933 0.070098 1.953457 0.050765
+
+ The class also allows for estimating a GNS model by adding spatial lags of the exogenous variables, using the argument slx_lags:
+
+ >>> model = GMM_Error(y, x, w=w, add_wy=True, slx_lags=1, name_y=y_var, name_x=x_var, name_ds='NAT')
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 CONSTANT -0.554756 0.551765 -1.00542 0.314695
+ 1 PS90 1.09369 0.225895 4.841583 0.000001
+ 2 UE90 0.697393 0.082744 8.428291 0.0
+ 3 W_PS90 -1.533378 0.396651 -3.865811 0.000111
+ 4 W_UE90 -1.107944 0.33523 -3.305028 0.00095
+ 5 W_HR90 1.529277 0.389354 3.927728 0.000086
+ 6 lambda -0.917928 0.079569 -11.53625 0.0
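+
+ The same wrapper can switch between the available moment estimators through
+ the estimator argument ('het', 'hom' or 'kp98', as listed in the Parameters
+ section). A minimal, unverified sketch (output omitted):
+
+ >>> model_hom = GMM_Error(y, x, w=w, estimator='hom', name_y=y_var, name_x=x_var, name_ds='NAT')  # doctest: +SKIP
+ >>> model_kp98 = GMM_Error(y, x, w=w, estimator='kp98', name_y=y_var, name_x=x_var, name_ds='NAT')  # doctest: +SKIP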
+
+
+ """
+
+
+"""
+Spatial Error with Heteroskedasticity family of models
+"""
+
+__author__ = "Luc Anselin lanselin@gmail.com, \
+        Pedro V. Amaral pedro.amaral@asu.edu, \
+        Daniel Arribas-Bel darribas@asu.edu, \
+        David C. Folch david.folch@asu.edu \
+        Ran Wei rwei5@asu.edu"
+
+import numpy as np
+import numpy.linalg as la
+from . import ols as OLS
+from . import user_output as USER
+from . import twosls as TSLS
+from . import utils as UTILS
+from .utils import RegressionPropsY, spdot, set_endog, sphstack, set_warn, get_lags
+from scipy import sparse as SP
+from libpysal.weights.spatial_lag import lag_spatial
+import pandas as pd
+from .output import output, _summary_iteration, _spat_pseudo_r2
+from itertools import compress
+
+__all__ = ["GM_Error_Het", "GM_Endog_Error_Het", "GM_Combo_Het"]
+
+
+class BaseGM_Error_Het(RegressionPropsY):
+
+"""
+ GMM method for a spatial error model with heteroskedasticity (note: no
+ consistency checks, diagnostics or constant added); based on
+ :cite:`Arraiz2010`, following :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ xtx : float
+ X'X
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+ >>> reg = spreg.error_sp_het.BaseGM_Error_Het(y, X, w.sparse, step1c=True)
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[47.9963 11.479 ]
+ [ 0.7105 0.3681]
+ [-0.5588 0.1616]
+ [ 0.4118 0.168 ]]
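+
+ The number of iterations of steps 2a and 2b is governed by max_iter and
+ epsilon (see the Parameters above). A minimal, unverified sketch raising the
+ iteration limit (output omitted):
+
+ >>> reg_it = spreg.error_sp_het.BaseGM_Error_Het(y, X, w.sparse, max_iter=5, step1c=True)  # doctest: +SKIP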
+ """
+
+    def __init__(self, y, x, w, max_iter=1, epsilon=0.00001, step1c=False, hard_bound=False):
+
+        self.step1c = step1c
+        # 1a. OLS --> \tilde{betas}
+        ols = OLS.BaseOLS(y=y, x=x)
+        self.x, self.y, self.n, self.k, self.xtx = ols.x, ols.y, ols.n, ols.k, ols.xtx
+        wA1 = UTILS.get_A1_het(w)
+
+        # 1b. GMM --> \tilde{\lambda1}
+        moments = UTILS._moments2eqs(wA1, w, ols.u)
+        lambda1 = UTILS.optim_moments(moments)
+
+        if step1c:
+            # 1c. GMM --> \tilde{\lambda2}
+            sigma = get_psi_sigma(w, ols.u, lambda1)
+            vc1 = get_vc_het(w, wA1, sigma)
+            lambda2 = UTILS.optim_moments(moments, vc1)
+        else:
+            lambda2 = lambda1
+
+        lambda_old = lambda2
+
+        self.iteration, eps = 0, 1
+        while self.iteration < max_iter and eps > epsilon:
+            # 2a. reg --> \hat{betas}
+            xs = UTILS.get_spFilter(w, lambda_old, self.x)
+            ys = UTILS.get_spFilter(w, lambda_old, self.y)
+            ols_s = OLS.BaseOLS(y=ys, x=xs)
+            self.predy = spdot(self.x, ols_s.betas)
+            self.u = self.y - self.predy
+
+            # 2b. GMM --> \hat{\lambda}
+            sigma_i = get_psi_sigma(w, self.u, lambda_old)
+            vc_i = get_vc_het(w, wA1, sigma_i)
+            moments_i = UTILS._moments2eqs(wA1, w, self.u)
+            lambda3 = UTILS.optim_moments(moments_i, vc_i)
+            eps = abs(lambda3 - lambda_old)
+            lambda_old = lambda3
+            self.iteration += 1
+
+        self.iter_stop = UTILS.iter_msg(self.iteration, max_iter)
+        if hard_bound:
+            if abs(lambda3) >= 0.99:
+                raise Exception("Spatial error parameter was outside the bounds of -0.99 and 0.99")
+        else:
+            if abs(lambda3) >= 0.99:
+                set_warn(self, "Spatial error parameter was outside the bounds of -0.99 and 0.99")
+
+        sigma = get_psi_sigma(w, self.u, lambda3)
+        vc3 = get_vc_het(w, wA1, sigma)
+        self.vm = get_vm_het(moments_i[0], lambda3, self, w, vc3)
+        self.betas = np.vstack((ols_s.betas, lambda3))
+        self.e_filtered = self.u - lambda3 * w * self.u
+        self._cache = {}
+
+
+
+[docs]
+class GM_Error_Het(BaseGM_Error_Het):
+"""
+ GMM method for a spatial error model with heteroskedasticity, with results
+ and diagnostics; based on :cite:`Arraiz2010`, following
+ :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ vm : array
+ Variance covariance matrix (kxk)
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ xtx : float
+ :math:`X'X`
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) and CRIME (crime) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries, so we are good to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Error_Het
+ >>> reg = GM_Error_Het(y, X, w=w, step1c=True, name_y='home value', name_x=['income', 'crime'], name_ds='columbus')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. This class offers an error model that explicitly accounts
+ for heteroskedasticity and that, unlike the models from
+ ``spreg.error_sp``, allows for inference on the spatial
+ parameter.
+
+ >>> print(reg.name_x)
+ ['CONSTANT', 'income', 'crime', 'lambda']
+
+ Hence, we find the same number of betas as standard errors,
+ which we calculate by taking the square root of the diagonal of the
+ variance-covariance matrix:
+
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[47.9963 11.479 ]
+ [ 0.7105 0.3681]
+ [-0.5588 0.1616]
+ [ 0.4118 0.168 ]]
+
+ Alternatively, we can have a summary of the output by typing:
+ print(reg.summary)
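+
+ As documented in the Parameters section, passing slx_lags greater than zero
+ estimates an SLX-Error specification instead. A minimal, unverified sketch
+ (output omitted):
+
+ >>> reg_slx = GM_Error_Het(y, X, w=w, slx_lags=1, name_y='home value', name_x=['income', 'crime'], name_ds='columbus')  # doctest: +SKIP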
+
+ """
+
+
+
+
+
+class BaseGM_Endog_Error_Het(RegressionPropsY):
+
+"""
+ GMM method for a spatial error model with heteroskedasticity and
+ endogenous variables (note: no consistency checks, diagnostics or constant
+ added); based on :cite:`Arraiz2010`, following :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ inv_method : string
+ If "power_exp", then compute inverse using the power
+ expansion. If "true_inv", then compute the true inverse.
+ Note that true_inv will fail for large n.
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ hth : float
+ :math:`H'H`
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+ >>> reg = spreg.error_sp_het.BaseGM_Endog_Error_Het(y, X, yd, q, w=w.sparse, step1c=True)
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[55.3971 28.8901]
+ [ 0.4656 0.7731]
+ [-0.6704 0.468 ]
+ [ 0.4114 0.1777]]
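+
+ The inv_method argument chooses between the power expansion ('power_exp') and
+ the true inverse ('true_inv'); the latter may fail for large n, as noted in
+ the Parameters above. A minimal, unverified sketch (output omitted):
+
+ >>> reg_inv = spreg.error_sp_het.BaseGM_Endog_Error_Het(y, X, yd, q, w=w.sparse, inv_method='true_inv')  # doctest: +SKIP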
+ """
+
+    def __init__(
+        self,
+        y,
+        x,
+        yend,
+        q,
+        w,
+        max_iter=1,
+        epsilon=0.00001,
+        step1c=False,
+        inv_method="power_exp",
+        hard_bound=False,
+    ):
+
+        self.step1c = step1c
+        # 1a. reg --> \tilde{betas}
+        tsls = TSLS.BaseTSLS(y=y, x=x, yend=yend, q=q)
+
+        if abs(tsls.betas[-1]) <= 0.9:
+            pass
+        else:
+            if tsls.betas[-1] < -0.9:
+                tsls.betas[-1] = -0.9
+            else:
+                tsls.betas[-1] = 0.9
+            tsls.u = tsls.y - spdot(tsls.z, tsls.betas)
+
+        self.x, self.z, self.h, self.y = tsls.x, tsls.z, tsls.h, tsls.y
+        self.yend, self.q, self.n, self.k, self.hth = (
+            tsls.yend,
+            tsls.q,
+            tsls.n,
+            tsls.k,
+            tsls.hth,
+        )
+        wA1 = UTILS.get_A1_het(w)
+
+        # 1b. GMM --> \tilde{\lambda1}
+        moments = UTILS._moments2eqs(wA1, w, tsls.u)
+        lambda1 = UTILS.optim_moments(moments)
+
+        if step1c:
+            # 1c. GMM --> \tilde{\lambda2}
+            self.u = tsls.u
+            zs = UTILS.get_spFilter(w, lambda1, self.z)
+            vc1 = get_vc_het_tsls(
+                w, wA1, self, lambda1, tsls.pfora1a2, zs, inv_method, filt=False
+            )
+            lambda2 = UTILS.optim_moments(moments, vc1)
+        else:
+            lambda2 = lambda1
+
+        # Forcing the 1st step lambda to be in the range [-0.9, 0.9] to avoid perfect collinearity in step 2 in case of SLX-Error or GNS models
+        # if lambda2 > 0.9:
+        #     lambda_old = 0.9
+        # elif lambda2 < -0.9:
+        #     lambda_old = -0.9
+        # else:
+        lambda_old = lambda2
+
+        self.iteration, eps = 0, 1
+        while self.iteration < max_iter and eps > epsilon:
+            # 2a. reg --> \hat{betas}
+            xs = UTILS.get_spFilter(w, lambda_old, self.x)
+            ys = UTILS.get_spFilter(w, lambda_old, self.y)
+            yend_s = UTILS.get_spFilter(w, lambda_old, self.yend)
+            tsls_s = TSLS.BaseTSLS(ys, xs, yend_s, h=self.h)
+
+            if abs(tsls_s.betas[-1]) <= 0.9:
+                pass
+            else:
+                if tsls_s.betas[-1] < -0.9:
+                    tsls_s.betas[-1] = -0.9
+                else:
+                    tsls_s.betas[-1] = 0.9
+                tsls_s.u = tsls_s.y - spdot(tsls_s.z, tsls_s.betas)
+
+            self.predy = spdot(self.z, tsls_s.betas)
+            self.u = self.y - self.predy
+
+            # 2b. GMM --> \hat{\lambda}
+            vc2 = get_vc_het_tsls(
+                w,
+                wA1,
+                self,
+                lambda_old,
+                tsls_s.pfora1a2,
+                sphstack(xs, yend_s),
+                inv_method,
+            )
+            moments_i = UTILS._moments2eqs(wA1, w, self.u)
+            lambda3 = UTILS.optim_moments(moments_i, vc2)
+
+            # if abs(lambda3) <= 0.9:
+            #     pass
+            # elif lambda3 > 0.9:
+            #     lambda3 = 0.9
+            # elif lambda3 < -0.9:
+            #     lambda3 = -0.9
+
+            eps = abs(lambda3 - lambda_old)
+            lambda_old = lambda3
+            self.iteration += 1
+
+        self.iter_stop = UTILS.iter_msg(self.iteration, max_iter)
+        if hard_bound:
+            if abs(lambda3) >= 0.99:
+                raise Exception("Spatial error parameter was outside the bounds of -0.99 and 0.99")
+            if abs(tsls_s.betas[-1]) >= 0.99:
+                raise Exception("Spatial lag parameter was outside the bounds of -0.99 and 0.99")
+        else:
+            if abs(lambda3) >= 0.99:
+                set_warn(self, "Spatial error parameter was outside the bounds of -0.99 and 0.99")
+            if abs(tsls_s.betas[-1]) >= 0.99:
+                set_warn(self, "Spatial lag parameter was outside the bounds of -0.99 and 0.99")
+
+        zs = UTILS.get_spFilter(w, lambda3, self.z)
+        P = get_P_hat(self, tsls.hthi, zs)
+        vc3 = get_vc_het_tsls(w, wA1, self, lambda3, P, zs, inv_method, save_a1a2=True)
+        self.vm = get_Omega_GS2SLS(w, lambda3, self, moments_i[0], vc3, P)
+        self.betas = np.vstack((tsls_s.betas, lambda3))
+        self.e_filtered = self.u - lambda3 * w * self.u
+        self._cache = {}
+
+
+
+[docs]
+class GM_Endog_Error_Het(BaseGM_Endog_Error_Het):
+
+"""
+ GMM method for a spatial error model with heteroskedasticity and
+ endogenous variables, with results and diagnostics; based on
+ :cite:`Arraiz2010`, following :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ inv_method : string
+ If "power_exp", then compute inverse using the power
+ expansion. If "true_inv", then compute the true inverse.
+ Note that true_inv will fail for large n.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ hth : float
+ :math:`H'H`
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import GM_Endog_Error_Het
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) vector from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ In this case we consider CRIME (crime rates) to be an endogenous regressor.
+ We tell the model that this is so by passing it in a different parameter
+ from the exogenous variables (x).
+
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+
+ Because we have endogenous variables, to obtain a correct estimate of the
+ model, we need to instrument for CRIME. We use DISCBD (distance to the
+ CBD) for this and hence put it in the instruments parameter, 'q'.
+
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries, so we are good to run the model. In this
+ case, we will need the variables (exogenous and endogenous), the
+ instruments and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Endog_Error_Het(y, X, yd, q, w=w, step1c=True, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. This class offers an error model that explicitly accounts
+ for heteroskedasticity and that, unlike the models from
+ ``spreg.error_sp``, allows for inference on the spatial
+ parameter. Hence, we find the same number of betas as standard errors,
+ which we calculate by taking the square root of the diagonal of the
+ variance-covariance matrix:
+
+ >>> print(reg.name_z)
+ ['CONSTANT', 'inc', 'crime', 'lambda']
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[55.3971 28.8901]
+ [ 0.4656 0.7731]
+ [-0.6704 0.468 ]
+ [ 0.4114 0.1777]]
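+
+ The iterative steps 2a and 2b are controlled by max_iter and epsilon (see the
+ Parameters section); increasing max_iter allows lambda to be refined further.
+ A minimal, unverified sketch (output omitted):
+
+ >>> reg_it = GM_Endog_Error_Het(y, X, yd, q, w=w, max_iter=5, epsilon=1e-6, step1c=True, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')  # doctest: +SKIP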
+
+ """
+
+
+
+
+
+class BaseGM_Combo_Het(BaseGM_Endog_Error_Het):
+
+"""
+ GMM method for a spatial lag and error model with heteroskedasticity and
+ endogenous variables (note: no consistency checks, diagnostics or constant
+ added); based on :cite:`Arraiz2010`, following :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ inv_method : string
+ If "power_exp", then compute inverse using the power
+ expansion. If "true_inv", then compute the true inverse.
+ Note that true_inv will fail for large n.
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ hth : float
+ :math:`H'H`
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+ >>> w_lags = 1
+ >>> yd2, q2 = spreg.set_endog(y, X, w, None, None, w_lags, True)
+ >>> X = np.hstack((np.ones(y.shape),X))
+
+ Example only with spatial lag
+
+ >>> reg = spreg.error_sp_het.BaseGM_Combo_Het(y, X, yend=yd2, q=q2, w=w.sparse, step1c=True)
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[ 9.9753 14.1435]
+ [ 1.5742 0.374 ]
+ [ 0.1535 0.3978]
+ [ 0.2103 0.3924]]
+
+ Example with both spatial lag and other endogenous variables
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+ >>> yd2, q2 = spreg.set_endog(y, X, w, yd, q, w_lags, True)
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> reg = spreg.error_sp_het.BaseGM_Combo_Het(y, X, yd2, q2, w=w.sparse, step1c=True)
+ >>> betas = np.array([['CONSTANT'],['inc'],['crime'],['lag_hoval'],['lambda']])
+ >>> print(np.hstack((betas, np.around(np.hstack((reg.betas, np.sqrt(reg.vm.diagonal()).reshape(5,1))),5))))
+ [['CONSTANT' '113.91292' '64.38815']
+ ['inc' '-0.34822' '1.18219']
+ ['crime' '-1.35656' '0.72482']
+ ['lag_hoval' '-0.57657' '0.75856']
+ ['lambda' '0.65608' '0.15719']]
+ """
+
+    def __init__(
+        self,
+        y,
+        x,
+        yend=None,
+        q=None,
+        w=None,
+        w_lags=1,
+        lag_q=True,
+        max_iter=1,
+        epsilon=0.00001,
+        step1c=False,
+        inv_method="power_exp",
+        hard_bound=False,
+    ):
+
+        BaseGM_Endog_Error_Het.__init__(
+            self,
+            y=y,
+            x=x,
+            w=w,
+            yend=yend,
+            q=q,
+            max_iter=max_iter,
+            step1c=step1c,
+            epsilon=epsilon,
+            inv_method=inv_method,
+            hard_bound=hard_bound,
+        )
+
+
+
+[docs]
+class GM_Combo_Het(BaseGM_Combo_Het):
+
+"""
+ GMM method for a spatial lag and error model with heteroskedasticity and
+ endogenous variables, with results and diagnostics; based on
+ :cite:`Arraiz2010`, following :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object (always needed)
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, if w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the General Nesting
+ Spatial Model (GNSM) type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ inv_method : string
+ If "power_exp", then compute inverse using the power
+ expansion. If "true_inv", then compute the true inverse.
+ Note that true_inv will fail for large n.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ hth : float
+ :math:`H'H`
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import GM_Combo_Het
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) vector from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ The Combo class runs a SARAR model, that is, a spatial lag+error model.
+ In this case we will run a simple version of that, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Combo_Het(y, X, w=w, step1c=True, name_y='hoval', name_x=['income'], name_ds='columbus')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that explicitly accounts
+ for heteroskedasticity and that, unlike the models in
+ ``spreg.error_sp``, allows for inference on the spatial
+ parameter. Hence, we obtain as many coefficient estimates as standard errors,
+ which we calculate by taking the square root of the diagonal of the
+ variance-covariance matrix:
+
+ >>> print(reg.name_z)
+ ['CONSTANT', 'income', 'W_hoval', 'lambda']
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[ 9.9753 14.1435]
+ [ 1.5742 0.374 ]
+ [ 0.1535 0.3978]
+ [ 0.2103 0.3924]]
+
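+ The instruments used for the spatially lagged dependent variable are spatial
+ lags of the exogenous variables. As a small, purely illustrative sketch (not
+ part of the estimation above, and assuming libpysal's ``lag_spatial`` helper
+ for the lag computation), they could be reproduced as::
+
+     wx = libpysal.weights.lag_spatial(w, X)    # WX, used when w_lags=1
+     wwx = libpysal.weights.lag_spatial(w, wx)  # WWX, added when w_lags=2
+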
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. As an example, we will include CRIME (crime rates) as
+ endogenous and will instrument with DISCBD (distance to the CBD). We first
+ need to read in the variables:
+
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> reg = GM_Combo_Het(y, X, yd, q, w=w, step1c=True, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')
+ >>> print(reg.name_z)
+ ['CONSTANT', 'inc', 'crime', 'W_hoval', 'lambda']
+ >>> print(np.round(reg.betas,4))
+ [[113.9129]
+ [ -0.3482]
+ [ -1.3566]
+ [ -0.5766]
+ [ 0.6561]]
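+
+ The pseudo R squared reported in ``pr2`` is simply the squared correlation
+ between the observed and predicted values; a minimal numpy illustration of
+ that definition (the class computes the attribute internally)::
+
+     r = np.corrcoef(reg.y.flatten(), reg.predy.flatten())[0, 1]
+     pr2_check = r ** 2   # should essentially match reg.pr2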
+
+ """
+
+
+[docs]
+class GM_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial error model with heteroskedasticity and regimes;
+ based on Arraiz et al :cite:`Arraiz2010`, following Anselin :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default)
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ Always False, kept for consistency, ignored.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from Arraiz
+ et al. Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from Arraiz et al. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from Arraiz et al.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path('natregimes.dbf'),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries; we are good to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Error_Het_Regimes(y, x, regimes, w=w, step1c=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT.dbf')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that explicitly accounts
+ for heteroskedasticity and that, unlike the models in
+ ``spreg.error_sp``, allows for inference on the spatial
+ parameter. Alternatively, we can obtain a summary of the
+ output by typing: model.summary
+
+ >>> print(reg.name_x)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', 'lambda']
+ >>> np.around(reg.betas, decimals=6)
+ array([[0.009121],
+ [0.812973],
+ [0.549355],
+ [5.00279 ],
+ [1.200929],
+ [0.614681],
+ [0.429277]])
+ >>> np.around(reg.std_err, decimals=6)
+ array([0.355844, 0.221743, 0.059276, 0.686764, 0.35843 , 0.092788,
+ 0.02524 ])
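+
+ Since the names in ``name_x`` are aligned with the rows of ``betas``, the two
+ can be combined for a quick, readable listing of regime-specific coefficients
+ (a small convenience sketch, not a class method)::
+
+     for name, b in zip(reg.name_x, reg.betas.flatten()):
+         print(name, round(b, 4))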
+
+ """
+
+
+[docs]
+class GM_Endog_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial error model with heteroskedasticity, regimes and
+ endogenous variables, with results and diagnostics; based on Arraiz et al
+ :cite:`Arraiz2010`, following Anselin :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep : boolean
+ Always False, kept for consistency, ignored.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ inv_method : string
+ If "power_exp", then compute inverse using the power
+ expansion. If "true_inv", then compute the true inverse.
+ Note that true_inv will fail for large n.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant).
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ h : array
+ nxl array of instruments (combination of x and q)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi : string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ For the endogenous models, we add the endogenous variable RD90 (resource deprivation)
+ and we decide to instrument for it with FP89 (families below poverty):
+
+ >>> yd_var = ['RD90']
+ >>> yend = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries; we are good to run the model. In this
+ case, we will need the variables (exogenous and endogenous), the
+ instruments and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Endog_Error_Het_Regimes(y, x, yend, q, regimes, w=w, step1c=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT.dbf')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that explicitly accounts
+ for heteroskedasticity and that, unlike the models in
+ ``spreg.error_sp``, allows for inference on the spatial
+ parameter. Hence, we obtain as many coefficient estimates as standard errors,
+ which we calculate by taking the square root of the diagonal of the
+ variance-covariance matrix. Alternatively, we can obtain a summary of the
+ output by typing: model.summary
+
+ >>> print(reg.name_z)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '0_RD90', '1_RD90', 'lambda']
+
+ >>> print(np.around(reg.betas,4))
+ [[ 3.5944]
+ [ 1.065 ]
+ [ 0.1587]
+ [ 9.184 ]
+ [ 1.8784]
+ [-0.2466]
+ [ 2.4617]
+ [ 3.5756]
+ [ 0.2908]]
+
+ >>> print(np.around(np.sqrt(reg.vm.diagonal()),4))
+ [0.5043 0.2132 0.0581 0.6681 0.3504 0.0999 0.3686 0.3402 0.028 ]
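+
+ The entries of ``z_stat`` pair each coefficient with its standard error in the
+ usual way; a rough hand computation of the same ratios and two-sided normal
+ p-values (scipy.stats is used here only for illustration)::
+
+     from scipy import stats
+     z = reg.betas.flatten() / np.sqrt(reg.vm.diagonal())
+     p = 2 * (1 - stats.norm.cdf(np.abs(z)))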
+
+ """
+
+
+[docs]
+class GM_Combo_Het_Regimes(GM_Endog_Error_Het_Regimes):
+"""
+ GMM method for a spatial lag and error model with heteroskedasticity,
+ regimes and endogenous variables, with results and diagnostics;
+ based on Arraiz et al :cite:`Arraiz2010`, following Anselin :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object (always needed)
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep : boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the GNSM type.
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, if w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ step1c : boolean
+ If True, then include Step 1c from :cite:`Arraiz2010`.
+ inv_method : string
+ If "power_exp", then compute inverse using the power
+ expansion. If "true_inv", then compute the true inverse.
+ Note that true_inv will fail for large n.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ h : array
+ nxl array of instruments (combination of x and q)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi : string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial combo model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries; we are good to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ Example only with spatial lag
+
+ The Combo class runs a SARAR model, that is, a spatial lag+error model.
+ In this case we will run a simple version of that, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional. We can have a
+ summary of the output by typing: model.summary.
+ Alternatively, we can check the betas:
+
+ >>> reg = GM_Combo_Het_Regimes(y, x, regimes, w=w, step1c=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(reg.name_z)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '_Global_W_HR90', 'lambda']
+ >>> print(np.around(reg.betas,4))
+ [[ 1.4613]
+ [ 0.9587]
+ [ 0.5658]
+ [ 9.1157]
+ [ 1.1324]
+ [ 0.6518]
+ [-0.4587]
+ [ 0.7174]]
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. In this case we consider RD90 (resource deprivation)
+ as an endogenous regressor. We use FP89 (families below poverty)
+ for this and hence put it in the instruments parameter, 'q'.
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> reg = GM_Combo_Het_Regimes(y, x, regimes, yd, q, w=w, step1c=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(reg.name_z)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '0_RD90', '1_RD90', '_Global_W_HR90', 'lambda']
+ >>> print(reg.betas)
+ [[ 3.41936197]
+ [ 1.04071048]
+ [ 0.16747219]
+ [ 8.85820215]
+ [ 1.847382 ]
+ [-0.24545394]
+ [ 2.43189808]
+ [ 3.61328423]
+ [ 0.03132164]
+ [ 0.29544224]]
+ >>> print(np.sqrt(reg.vm.diagonal()))
+ [0.53103804 0.20835827 0.05755679 1.00496234 0.34332131 0.10259525
+ 0.3454436 0.37932794 0.07611667 0.07067059]
+ >>> print('lambda: ', np.around(reg.betas[-1], 4))
+ lambda: [0.2954]
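+
+ Because regime_lag_sep is False by default, the spatial lag parameter is held
+ fixed across regimes and appears once under '_Global_W_HR90'. Its estimate can
+ be pulled out by name (a small indexing sketch, not a class method)::
+
+     rho_idx = reg.name_z.index('_Global_W_HR90')
+     rho_hat = reg.betas[rho_idx][0]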
+
+ """
+
+
+"""
+Hom family of models based on: :cite:`Drukker2013`
+Following: :cite:`Anselin2011`
+
+"""
+
+__author__ = "Luc Anselin lanselin@gmail.com, Daniel Arribas-Bel darribas@asu.edu"
+
+from scipy import sparse as SP
+import numpy as np
+from numpy import linalg as la
+from . import ols as OLS
+from .utils import set_endog, iter_msg, sp_att
+from .utils import get_A1_hom, get_A2_hom, get_A1_het, optim_moments
+from .utils import get_spFilter, get_lags
+from .utils import spdot, RegressionPropsY, set_warn
+from . import twosls as TSLS
+from . import user_output as USER
+import pandas as pd
+from .output import output, _spat_pseudo_r2, _summary_iteration
+from itertools import compress
+
+__all__ = ["GM_Error_Hom", "GM_Endog_Error_Hom", "GM_Combo_Hom"]
+
+
+class BaseGM_Error_Hom(RegressionPropsY):
+"""
+ GMM method for a spatial error model with homoskedasticity (note: no
+ consistency checks, diagnostics or constant added); based on
+ Drukker et al. (2013) :cite:`Drukker2013`, following Anselin (2011) :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in :cite:`Arraiz2010`.
+ If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc' (default), then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ xtx : float
+ :math:`X'X`
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+
+ Model commands
+
+ >>> reg = BaseGM_Error_Hom(y, X, w=w.sparse, A1='hom_sc')
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[47.9479 12.3021]
+ [ 0.7063 0.4967]
+ [-0.556 0.179 ]
+ [ 0.4129 0.1835]]
+ >>> print(np.around(reg.vm, 4)) #doctest: +SKIP
+ [[ 1.51340700e+02 -5.29060000e+00 -1.85650000e+00 -2.40000000e-03]
+ [ -5.29060000e+00 2.46700000e-01 5.14000000e-02 3.00000000e-04]
+ [ -1.85650000e+00 5.14000000e-02 3.21000000e-02 -1.00000000e-04]
+ [ -2.40000000e-03 3.00000000e-04 -1.00000000e-04 3.37000000e-02]]
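+
+ Internally, each iteration applies a spatially weighted least squares (SWLS)
+ step: y and X are spatially filtered as y - lambda*W*y and X - lambda*W*X and
+ then re-estimated by OLS. A rough numpy sketch of that transformation
+ (illustrative only; the class relies on its own helper functions and the
+ estimated lambda)::
+
+     lam = 0.4   # hypothetical value of the spatial parameter
+     y_s = y - lam * (w.sparse @ y)
+     X_s = X - lam * (w.sparse @ X)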
+ """
+
+    def __init__(
+        self, y, x, w, max_iter=1, epsilon=0.00001, A1="hom_sc", hard_bound=False
+    ):
+        if A1 == "hom":
+            wA1 = get_A1_hom(w)
+        elif A1 == "hom_sc":
+            wA1 = get_A1_hom(w, scalarKP=True)
+        elif A1 == "het":
+            wA1 = get_A1_het(w)
+
+        wA2 = get_A2_hom(w)
+
+        # 1a. OLS --> \tilde{\delta}
+        ols = OLS.BaseOLS(y=y, x=x)
+        self.x, self.y, self.n, self.k, self.xtx = ols.x, ols.y, ols.n, ols.k, ols.xtx
+
+        # 1b. GM --> \tilde{\rho}
+        moments = moments_hom(w, wA1, wA2, ols.u)
+        lambda1 = optim_moments(moments, hard_bound=hard_bound)
+        lambda_old = lambda1
+
+        self.iteration, eps = 0, 1
+        while self.iteration < max_iter and eps > epsilon:
+            # 2a. SWLS --> \hat{\delta}
+            x_s = get_spFilter(w, lambda_old, self.x)
+            y_s = get_spFilter(w, lambda_old, self.y)
+            ols_s = OLS.BaseOLS(y=y_s, x=x_s)
+            self.predy = spdot(self.x, ols_s.betas)
+            self.u = self.y - self.predy
+
+            # 2b. GM 2nd iteration --> \hat{\rho}
+            moments = moments_hom(w, wA1, wA2, self.u)
+            psi = get_vc_hom(w, wA1, wA2, self, lambda_old)[0]
+            lambda2 = optim_moments(moments, psi, hard_bound=hard_bound)
+            eps = abs(lambda2 - lambda_old)
+            lambda_old = lambda2
+            self.iteration += 1
+
+        self.iter_stop = iter_msg(self.iteration, max_iter)
+
+        # Output
+        self.betas = np.vstack((ols_s.betas, lambda2))
+        self.vm, self.sig2 = get_omega_hom_ols(w, wA1, wA2, self, lambda2, moments[0])
+        self.e_filtered = self.u - lambda2 * w * self.u
+        self._cache = {}
+
+
+
+[docs]
+class GM_Error_Hom(BaseGM_Error_Hom):
+"""
+ GMM method for a spatial error model with homoskedasticity, with results
+ and diagnostics; based on Drukker et al. (2013) :cite:`Drukker2013`, following Anselin
+ (2011) :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in Arraiz et
+ al. If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc' (default), then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from Arraiz et al.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ xtx : float
+ :math:`X'X`
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) and CRIME (crime) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries, so we are ready to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Error_Hom(y, X, w=w, A1='hom_sc', name_y='home value', name_x=['income', 'crime'], name_ds='columbus')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that assumes
+ homoskedasticity but that unlike the models from
+ ``spreg.error_sp``, it allows for inference on the spatial
+ parameter. This is why you obtain as many coefficient estimates as
+ standard errors, which you calculate by taking the square root of the
+ diagonal of the variance-covariance matrix of the parameters:
+
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[47.9479 12.3021]
+ [ 0.7063 0.4967]
+ [-0.556 0.179 ]
+ [ 0.4129 0.1835]]
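+
+ The table printed above combines the coefficient estimates with their
+ standard errors. As an optional convenience (not part of the original
+ example, and using only the documented ``betas`` and ``vm`` attributes),
+ the same quantities can be stored for later use:
+
+ >>> std_err = np.sqrt(reg.vm.diagonal()).reshape(-1, 1)
+ >>> coef_table = np.hstack((reg.betas, std_err))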
+
+ """
+
+
+
+
+
+class BaseGM_Endog_Error_Hom(RegressionPropsY):
+"""
+ GMM method for a spatial error model with homoskedasticity and
+ endogenous variables (note: no consistency checks, diagnostics or constant
+ added); based on Drukker et al. (2013) :cite:`Drukker2013`, following Anselin (2011)
+ :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in Arraiz et
+ al. If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc' (default), then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ hth : float
+ :math:`H'H`
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+ >>> reg = BaseGM_Endog_Error_Hom(y, X, yd, q, w=w.sparse, A1='hom_sc')
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[55.3658 23.496 ]
+ [ 0.4643 0.7382]
+ [-0.669 0.3943]
+ [ 0.4321 0.1927]]
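+
+ Because the spatial parameter is stacked last in ``betas`` (a small
+ illustrative step based on the output layout shown above, not part of the
+ original example), it can be retrieved directly:
+
+ >>> lambda_hat = reg.betas[-1][0]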
+
+
+ """
+
+    def __init__(
+        self,
+        y,
+        x,
+        yend,
+        q,
+        w,
+        max_iter=1,
+        epsilon=0.00001,
+        A1="hom_sc",
+        hard_bound=False,
+    ):
+        if A1 == "hom":
+            wA1 = get_A1_hom(w)
+        elif A1 == "hom_sc":
+            wA1 = get_A1_hom(w, scalarKP=True)
+        elif A1 == "het":
+            wA1 = get_A1_het(w)
+
+        wA2 = get_A2_hom(w)
+
+        # 1a. S2SLS --> \tilde{\delta}
+        tsls = TSLS.BaseTSLS(y=y, x=x, yend=yend, q=q)
+        self.x, self.z, self.h, self.y, self.hth = (
+            tsls.x,
+            tsls.z,
+            tsls.h,
+            tsls.y,
+            tsls.hth,
+        )
+        self.yend, self.q, self.n, self.k = tsls.yend, tsls.q, tsls.n, tsls.k
+
+        # 1b. GM --> \tilde{\rho}
+        moments = moments_hom(w, wA1, wA2, tsls.u)
+        lambda1 = optim_moments(moments, hard_bound=hard_bound)
+        lambda_old = lambda1
+
+        self.iteration, eps = 0, 1
+        while self.iteration < max_iter and eps > epsilon:
+            # 2a. GS2SLS --> \hat{\delta}
+            x_s = get_spFilter(w, lambda_old, self.x)
+            y_s = get_spFilter(w, lambda_old, self.y)
+            yend_s = get_spFilter(w, lambda_old, self.yend)
+            tsls_s = TSLS.BaseTSLS(y=y_s, x=x_s, yend=yend_s, h=self.h)
+            self.predy = spdot(self.z, tsls_s.betas)
+            self.u = self.y - self.predy
+
+            # 2b. GM 2nd iteration --> \hat{\rho}
+            moments = moments_hom(w, wA1, wA2, self.u)
+            psi = get_vc_hom(w, wA1, wA2, self, lambda_old, tsls_s.z)[0]
+            lambda2 = optim_moments(moments, psi, hard_bound=hard_bound)
+            eps = abs(lambda2 - lambda_old)
+            lambda_old = lambda2
+            self.iteration += 1
+
+        self.iter_stop = iter_msg(self.iteration, max_iter)
+
+        # Output
+        self.betas = np.vstack((tsls_s.betas, lambda2))
+        self.vm, self.sig2 = get_omega_hom(w, wA1, wA2, self, lambda2, moments[0])
+        self.e_filtered = self.u - lambda2 * w * self.u
+        self._cache = {}
+
+
+
+[docs]
+class GM_Endog_Error_Hom(BaseGM_Endog_Error_Hom):
+"""
+ GMM method for a spatial error model with homoskedasticity and endogenous
+ variables, with results and diagnostics; based on Drukker et al. (2013)
+ :cite:`Drukker2013`, following Anselin (2011) :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in :cite:`Arraiz2010`.
+ If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc' (default), then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ hth : float
+ :math:`H'H`
+
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract the INC (income) vector from the DBF to be used as an
+ independent variable in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ In this case we consider CRIME (crime rates) to be an endogenous regressor.
+ We tell the model that this is so by passing it in a different parameter
+ from the exogenous variables (x).
+
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+
+ Because we have endogenous variables, to obtain a correct estimate of the
+ model, we need to instrument for CRIME. We use DISCBD (distance to the
+ CBD) for this and hence put it in the instruments parameter, 'q'.
+
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries, so we are ready to run the model. In this
+ case, we will need the variables (exogenous and endogenous), the
+ instruments and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> reg = GM_Endog_Error_Hom(y, X, yd, q, w=w, A1='hom_sc', name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that assumes
+ homoskedasticity but that unlike the models from
+ ``spreg.error_sp``, it allows for inference on the spatial
+ parameter. Hence, we find the same number of betas as standard errors,
+ which we calculate by taking the square root of the diagonal of the
+ variance-covariance matrix:
+
+ >>> print(reg.name_z)
+ ['CONSTANT', 'inc', 'crime', 'lambda']
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[55.3658 23.496 ]
+ [ 0.4643 0.7382]
+ [-0.669 0.3943]
+ [ 0.4321 0.1927]]
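+
+ As a small additional illustration (an optional step, not part of the
+ original example), the coefficient names held in ``name_z`` can be paired
+ with the corresponding estimates:
+
+ >>> coefs = dict(zip(reg.name_z, reg.betas.flatten()))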
+ """
+
+
+
+
+
+class BaseGM_Combo_Hom(BaseGM_Endog_Error_Hom):
+"""
+ GMM method for a spatial lag and error model with homoskedasticity and
+ endogenous variables (note: no consistency checks, diagnostics or constant
+ added); based on Drukker et al. (2013) :cite:`Drukker2013`, following Anselin (2011)
+ :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, if w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in Arraiz et
+ al. If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc' (default), then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ hth : float
+ :math:`H'H`
+
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+ >>> w_lags = 1
+ >>> yd2, q2 = spreg.set_endog(y, X, w, None, None, w_lags, True)
+ >>> X = np.hstack((np.ones(y.shape),X))
+
+ Example only with spatial lag
+
+ >>> reg = spreg.error_sp_hom.BaseGM_Combo_Hom(y, X, yend=yd2, q=q2, w=w.sparse, A1='hom_sc')
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[10.1254 15.2871]
+ [ 1.5683 0.4407]
+ [ 0.1513 0.4048]
+ [ 0.2103 0.4226]]
+
+
+ Example with both spatial lag and other endogenous variables
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+ >>> yd2, q2 = spreg.set_endog(y, X, w, yd, q, w_lags, True)
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> reg = spreg.error_sp_hom.BaseGM_Combo_Hom(y, X, yd2, q2, w=w.sparse, A1='hom_sc')
+ >>> betas = np.array([['CONSTANT'],['inc'],['crime'],['W_hoval'],['lambda']])
+ >>> print(np.hstack((betas, np.around(np.hstack((reg.betas, np.sqrt(reg.vm.diagonal()).reshape(5,1))),5))))
+ [['CONSTANT' '111.77057' '67.75191']
+ ['inc' '-0.30974' '1.16656']
+ ['crime' '-1.36043' '0.6841']
+ ['W_hoval' '-0.52908' '0.84428']
+ ['lambda' '0.60116' '0.18605']]
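+
+ In the combo specification the coefficient on the spatially lagged
+ dependent variable and the error parameter are the last two entries of
+ ``betas`` (following the ordering shown above). As an optional follow-up,
+ not part of the original example, they can be pulled out directly:
+
+ >>> rho_hat, lambda_hat = reg.betas[-2][0], reg.betas[-1][0]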
+
+ """
+
+    def __init__(
+        self,
+        y,
+        x,
+        yend=None,
+        q=None,
+        w=None,
+        w_lags=1,
+        lag_q=True,
+        max_iter=1,
+        epsilon=0.00001,
+        A1="hom_sc",
+        hard_bound=False,
+    ):
+        BaseGM_Endog_Error_Hom.__init__(
+            self,
+            y=y,
+            x=x,
+            w=w,
+            yend=yend,
+            q=q,
+            A1=A1,
+            max_iter=max_iter,
+            epsilon=epsilon,
+            hard_bound=hard_bound,
+        )
+
+
+
+[docs]
+class GM_Combo_Hom(BaseGM_Combo_Hom):
+"""
+ GMM method for a spatial lag and error model with homoskedasticity and
+ endogenous variables, with results and diagnostics; based on Drukker et
+ al. (2013) :cite:`Drukker2013`, following Anselin (2011) :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object (always necessary)
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, if w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the General Nesting
+ Spatial Model (GNSM) type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in :cite:`Arraiz2010`.
+ If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc' (default), then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ sig2 : float
+ Sigma squared used in computations (based on filtered
+ residuals)
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ hth : float
+ :math:`H'H`
+
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract the INC (income) vector from the DBF to be used as an
+ independent variable in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ Example only with spatial lag
+
+ The Combo class runs a SARAR model, that is, a spatial lag plus error model.
+ In this case we will run a simple version of that, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Combo_Hom
+ >>> reg = GM_Combo_Hom(y, X, w=w, A1='hom_sc', name_x=['inc'],\
+ name_y='hoval', name_yend=['crime'], name_q=['discbd'],\
+ name_ds='columbus')
+ >>> print(np.around(np.hstack((reg.betas,np.sqrt(reg.vm.diagonal()).reshape(4,1))),4))
+ [[10.1254 15.2871]
+ [ 1.5683 0.4407]
+ [ 0.1513 0.4048]
+ [ 0.2103 0.4226]]
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. As an example, we will include CRIME (crime rates) as
+ endogenous and will instrument with DISCBD (distance to the CBD). We first
+ need to read in the variables:
+
+
+ >>> yd = []
+ >>> yd.append(db.by_col("CRIME"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> reg = GM_Combo_Hom(y, X, yd, q, w=w, A1='hom_sc', \
+ name_ds='columbus')
+ >>> betas = np.array([['CONSTANT'],['inc'],['crime'],['W_hoval'],['lambda']])
+ >>> print(np.hstack((betas, np.around(np.hstack((reg.betas, np.sqrt(reg.vm.diagonal()).reshape(5,1))),5))))
+ [['CONSTANT' '111.77057' '67.75191']
+ ['inc' '-0.30974' '1.16656']
+ ['crime' '-1.36043' '0.6841']
+ ['W_hoval' '-0.52908' '0.84428']
+ ['lambda' '0.60116' '0.18605']]
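+
+ The combo class also exposes the two pseudo R squared measures documented
+ above (``pr2`` and ``pr2_e``). As an optional final step, not part of the
+ original example, they can be collected for comparison:
+
+ >>> fit_measures = (reg.pr2, reg.pr2_e)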
+
+ """
+
+
+"""
+Hom family of models with regimes.
+"""
+
+__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, Daniel Arribas-Bel darribas@asu.edu"
+
+from scipy import sparse as SP
+import numpy as np
+import multiprocessing as mp
+from numpy import linalg as la
+from libpysal.weights.spatial_lag import lag_spatial
+from .utils import power_expansion, set_endog, iter_msg, sp_att
+from .utils import get_A1_hom, get_A2_hom, get_A1_het, optim_moments
+from .utils import get_spFilter, get_lags, _moments2eqs
+from .utils import spdot, RegressionPropsY, set_warn
+from .sputils import sphstack
+from .ols import BaseOLS
+from .twosls import BaseTSLS
+from .error_sp_hom import (
+    BaseGM_Error_Hom,
+    BaseGM_Endog_Error_Hom,
+    moments_hom,
+    get_vc_hom,
+    get_omega_hom,
+    get_omega_hom_ols,
+)
+from . import regimes as REGI
+from . import user_output as USER
+from platform import system
+import pandas as pd
+from .output import output, _summary_iteration, _spat_pseudo_r2
+
+
+
+[docs]
+class GM_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial error model with homoskedasticity, with regimes,
+ results and diagnostics; based on Drukker et al. (2013) :cite:`Drukker2013`, following
+ Anselin (2011) :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ Always False, kept for consistency, ignored.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional
+ stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in
+ :cite:`Arraiz2010`. If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc', then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ xtx : float
+ :math:`X'X`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("NAT.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries, so we are ready to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Error_Hom_Regimes
+ >>> reg = GM_Error_Hom_Regimes(y, x, regimes, w=w, name_y=y_var, name_x=x_var, name_ds='NAT')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that assumes
+ homoskedasticity but that unlike the models from
+ ``spreg.error_sp``, it allows for inference on the spatial
+ parameter. This is why you obtain as many coefficient estimates as
+ standard errors, which you calculate by taking the square root of the
+ diagonal of the variance-covariance matrix of the parameters. Alternatively,
+ we can have a summary of the output by typing: model.summary
+
+ >>> print(reg.name_x)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', 'lambda']
+
+ >>> print(np.around(reg.betas,4))
+ [[0.069 ]
+ [0.7885]
+ [0.5398]
+ [5.0948]
+ [1.1965]
+ [0.6018]
+ [0.4104]]
+
+ >>> print(np.sqrt(reg.vm.diagonal()))
+ [0.39105853 0.15664624 0.05254328 0.48379958 0.20018799 0.05834139
+ 0.01882401]
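+
+ Since ``name_x`` carries the regime-prefixed variable names (plus
+ ``lambda``), an optional convenience step, not part of the original
+ example, is to map each name to its standard error:
+
+ >>> std_err_by_name = dict(zip(reg.name_x, np.sqrt(reg.vm.diagonal())))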
+ """
+
+
+[docs]
+class GM_Endog_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial error model with homoskedasticity, regimes and
+ endogenous variables.
+ Based on Drukker et al. (2013) :cite:`Drukker2013`, following Anselin (2011)
+ :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep : boolean
+ Always False, kept for consistency, ignored.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from
+ :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in
+ :cite:`Arraiz2010`. If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc', then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ h : array
+ nxl array of instruments (combination of x and q)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ hth : float
+ :math:`H'H`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi : ['one', 'many']
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ For the endogenous models, we add the endogenous variable RD90 (resource deprivation)
+ and we decide to instrument for it with FP89 (families below poverty):
+
+ >>> yd_var = ['RD90']
+ >>> yend = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries, so we are ready to run the model. In this
+ case, we will need the variables (exogenous and endogenous), the
+ instruments and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Endog_Error_Hom_Regimes
+ >>> reg = GM_Endog_Error_Hom_Regimes(y, x, yend, q, regimes, w=w, A1='hom_sc', name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT.dbf')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them. This class offers an error model that assumes
+ homoskedasticity but that unlike the models from
+ ``spreg.error_sp``, it allows for inference on the spatial
+ parameter. Hence, we find the same number of betas as standard errors,
+ which we calculate by taking the square root of the diagonal of the
+ variance-covariance matrix. Alternatively, we can have a summary of the
+ output by typing: model.summary
+
+ >>> print(reg.name_z)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '0_RD90', '1_RD90', 'lambda']
+
+ >>> print(np.around(reg.betas,4))
+ [[ 3.5973]
+ [ 1.0652]
+ [ 0.1582]
+ [ 9.198 ]
+ [ 1.8809]
+ [-0.2489]
+ [ 2.4616]
+ [ 3.5796]
+ [ 0.2541]]
+
+ >>> print(np.around(np.sqrt(reg.vm.diagonal()),4))
+ [0.5204 0.1371 0.0629 0.4721 0.1824 0.0725 0.2992 0.2395 0.024 ]
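+
+ Because this estimator reports a standard error for every coefficient,
+ including lambda, the names, estimates and standard errors shown above can
+ be paired up directly. The lines below are only a sketch of one way to do
+ so with the attributes already used in this example (nothing is printed):
+
+ >>> se = np.sqrt(reg.vm.diagonal())
+ >>> coef_table = list(zip(reg.name_z, reg.betas.flatten(), se))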
+
+ """
+
+
+[docs]
+class GM_Combo_Hom_Regimes(GM_Endog_Error_Hom_Regimes):
+"""
+ GMM method for a spatial lag and error model with homoskedasticity,
+ regimes and endogenous variables, with results and diagnostics;
+ based on Drukker et al. (2013) :cite:`Drukker2013`, following Anselin (2011)
+ :cite:`Anselin2011`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object (always needed)
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep : boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the GNSM type.
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ max_iter : int
+ Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Note: epsilon provides an additional stop condition.
+ epsilon : float
+ Minimum change in lambda required to stop iterations of
+ steps 2a and 2b from :cite:`Arraiz2010`. Note: max_iter provides
+ an additional stop condition.
+ A1 : string
+ If A1='het', then the matrix A1 is defined as in
+ :cite:`Arraiz2010`. If A1='hom', then as in :cite:`Anselin2011`. If
+ A1='hom_sc', then as in :cite:`Drukker2013`
+ and :cite:`Drukker:2013aa`.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ h : array
+ nxl array of instruments (combination of x and q)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iter_stop : string
+ Stop criterion reached during iteration of steps 2a and 2b
+ from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ iteration : integer
+ Number of iterations of steps 2a and 2b from :cite:`Arraiz2010`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2 : float
+ Sigma squared used in computations (based on filtered
+ residuals)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi : string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep : boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the shape (n, ) that other
+ packages commonly accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...].
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial combo model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``natregimes.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries; we are ready to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ Example only with spatial lag
+
+ The Combo class runs a SARAR model, that is, a spatial lag+error model.
+ In this case we will run a simple version of that, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. We can get a
+ summary of the output by typing: model.summary
+ Alternatively, we can check the betas:
+
+ >>> from spreg import GM_Combo_Hom_Regimes
+ >>> reg = GM_Combo_Hom_Regimes(y, x, regimes, w=w, A1='hom_sc', name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(reg.name_z)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '_Global_W_HR90', 'lambda']
+ >>> print(np.around(reg.betas,4))
+ [[ 1.4607]
+ [ 0.9579]
+ [ 0.5658]
+ [ 9.1129]
+ [ 1.1339]
+ [ 0.6517]
+ [-0.4583]
+ [ 0.6634]]
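+
+ As ``name_z`` shows, the spatial lag coefficient (``_Global_W_HR90``) and
+ lambda are the last two entries of ``betas``, so they can be pulled out
+ directly. This is only an illustrative sketch (nothing is printed):
+
+ >>> rho, lam = reg.betas[-2][0], reg.betas[-1][0]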
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. In this case we consider RD90 (resource deprivation)
+ as an endogenous regressor. We use FP89 (families below poverty)
+ for this and hence put it in the instruments parameter, 'q'.
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> reg = GM_Combo_Hom_Regimes(y, x, regimes, yd, q, w=w, A1='hom_sc', name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(reg.name_z)
+ ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '0_RD90', '1_RD90', '_Global_W_HR90', 'lambda']
+ >>> print(reg.betas)
+ [[ 3.4196478 ]
+ [ 1.04065595]
+ [ 0.16630304]
+ [ 8.86570777]
+ [ 1.85134286]
+ [-0.24921597]
+ [ 2.43007651]
+ [ 3.61656899]
+ [ 0.03315061]
+ [ 0.22636055]]
+ >>> print(np.sqrt(reg.vm.diagonal()))
+ [0.53989913 0.13506086 0.06143434 0.77049956 0.18089997 0.07246848
+ 0.29218837 0.25378655 0.06184801 0.06323236]
+ >>> print('lambda: ', np.around(reg.betas[-1], 4))
+ lambda: [0.2264]
+
+ """
+
+
+[docs]
+class GM_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial error model with regimes, with results and diagnostics;
+ based on Kelejian and Prucha (1998, 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object
+ constant_regi: string, optional
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ Always False, kept for consistency, ignored.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import libpysal
+ >>> import numpy as np
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the shape (n, ) that other
+ packages commonly accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...].
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``natregimes.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries; we are ready to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Error_Regimes
+ >>> model = GM_Error_Regimes(y, x, regimes, w=w, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT.dbf')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. Note that because we are running the classical GMM error
+ model from 1998/99, the spatial parameter is obtained as a point estimate:
+ although you get a value for it (it is the last element of model.betas),
+ you cannot perform inference on it (note that no standard error is reported
+ for lambda in the output below). Alternatively, we can get a summary of the
+ output by typing: model.summary
+
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 0.074811 0.379864 0.196942 0.843873
+ 1 0_PS90 0.786105 0.152315 5.161043 0.0
+ 2 0_UE90 0.538848 0.051942 10.373969 0.0
+ 3 1_CONSTANT 5.103761 0.471284 10.82949 0.0
+ 4 1_PS90 1.196009 0.19867 6.020074 0.0
+ 5 1_UE90 0.600532 0.057252 10.489217 0.0
+ 6 lambda 0.3641 None None None
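+
+ The ``cols2regi`` argument described in the Parameters section controls
+ which columns of x vary by regime. As a sketch only (not part of the
+ original example), letting PS90 vary by regime while holding UE90 fixed
+ across regimes could look like this:
+
+ >>> model_mixed = GM_Error_Regimes(y, x, regimes, w=w, cols2regi=[True, False], name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT.dbf')
+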
+ """
+
+
+[docs]
+class GM_Endog_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial error model with regimes and endogenous variables, with
+ results and diagnostics; based on Kelejian and Prucha (1998,
+ 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ Always False, kept for consistency, ignored.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi : ['one', 'many']
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import libpysal
+ >>> import numpy as np
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the shape (n, ) that other
+ packages commonly accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...].
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ For the endogenous models, we add the endogenous variable RD90 (resource deprivation)
+ and we decide to instrument for it with FP89 (families below poverty):
+
+ >>> yd_var = ['RD90']
+ >>> yend = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``natregimes.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We are all set with the preliminaries; we are ready to run the model. In this
+ case, we will need the variables (exogenous and endogenous), the
+ instruments and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Endog_Error_Regimes
+ >>> model = GM_Endog_Error_Regimes(y, x, yend, q, regimes, w=w, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT.dbf')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. Note that because we are running the classical GMM error
+ model from 1998/99, the spatial parameter is obtained as a point estimate:
+ although you get a value for it (it is the last element of model.betas),
+ you cannot perform inference on it (note that no standard error is reported
+ for lambda in the output below). Also, this regression uses a two stage least
+ squares estimation method that accounts for the endogeneity created by the
+ endogenous variables included. Alternatively, we can get a summary of the
+ output by typing: model.summary
+
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 3.597178 0.522633 6.882796 0.0
+ 1 0_PS90 1.065203 0.137555 7.743852 0.0
+ 2 0_UE90 0.15822 0.063054 2.509282 0.012098
+ 6 0_RD90 2.461609 0.300711 8.185967 0.0
+ 3 1_CONSTANT 9.197542 0.473654 19.418268 0.0
+ 4 1_PS90 1.880815 0.18335 10.258046 0.0
+ 5 1_UE90 -0.248777 0.072786 -3.417919 0.000631
+ 7 1_RD90 3.579429 0.240413 14.888666 0.0
+ 8 lambda 0.255639 None None None
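+
+ The ``regime_err_sep`` option documented above runs a separate regression
+ for each regime and stores the individual results in the ``multi``
+ dictionary. A sketch of such a call, assuming the data and weights can be
+ split by regime, might look like this:
+
+ >>> model_sep = GM_Endog_Error_Regimes(y, x, yend, q, regimes, w=w, regime_err_sep=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT.dbf')
+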
+ """
+
+
+[docs]
+class GM_Combo_Regimes(GM_Endog_Error_Regimes, REGI.Regimes_Frame):
+"""
+ GMM method for a spatial lag and error model with regimes and endogenous
+ variables, with results and diagnostics; based on Kelejian and Prucha (1998,
+ 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object (always needed)
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the GNSM type.
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2 : float
+ Sigma squared used in computations (based on filtered
+ residuals)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi : string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1), as opposed to the shape (n, ) that other
+ packages commonly accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...].
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial lag model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``natregimes.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ The Combo class runs a SARAR model, that is, a spatial lag+error model.
+ In this case we will run a simple version of that, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Combo_Regimes
+ >>> model = GM_Combo_Regimes(y, x, regimes, w=w, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT')
+
+ Once we have run the model, we can explore the output a little. The
+ regression object we have created has many attributes, so take your time to
+ discover them. Note that because we are running the classical GMM error
+ model from 1998/99, the spatial parameter is obtained as a point estimate:
+ although you get a value for it (it is the last element of model.betas),
+ you cannot perform inference on it (note that no standard error is reported
+ for lambda in the output below). Also, this regression uses a two stage least
+ squares estimation method that accounts for the endogeneity created by the
+ spatial lag of the dependent variable. We can get a summary of the
+ output by typing: model.summary
+ Alternatively, we can check the betas:
+
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 1.460707 0.704174 2.074356 0.038046
+ 1 0_PS90 0.95795 0.171485 5.586214 0.0
+ 2 0_UE90 0.565805 0.053665 10.543203 0.0
+ 3 1_CONSTANT 9.112998 1.525875 5.972311 0.0
+ 4 1_PS90 1.13382 0.20552 5.51683 0.0
+ 5 1_UE90 0.65169 0.061106 10.664938 0.0
+ 6 _Global_W_HR90 -0.458326 0.145599 -3.147859 0.001645
+ 7 lambda 0.613599 None None None
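+
+ Because this is a spatial lag specification, the attributes based on the
+ reduced form listed above (``predy_e``, ``e_pred`` and the pseudo R squared
+ ``pr2_e``) are also available. As a sketch only (nothing is printed):
+
+ >>> yhat_rf, r2_rf = model.predy_e, model.pr2_e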
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. In this case we consider RD90 (resource deprivation)
+ as an endogenous regressor. We use FP89 (families below poverty)
+ for this and hence put it in the instruments parameter, 'q'.
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> model = GM_Combo_Regimes(y, x, regimes, yd, q, w=w, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 3.419638 0.530676 6.443931 0.0
+ 1 0_PS90 1.040658 0.132714 7.841346 0.0
+ 2 0_UE90 0.166344 0.06058 2.745844 0.006036
+ 6 0_RD90 2.43014 0.289431 8.396263 0.0
+ 3 1_CONSTANT 8.865446 0.764064 11.603014 0.0
+ 4 1_PS90 1.851205 0.179698 10.301769 0.0
+ 5 1_UE90 -0.249085 0.071674 -3.475235 0.00051
+ 7 1_RD90 3.616455 0.253083 14.289586 0.0
+ 8 _Global_W_HR90 0.033087 0.061265 0.540057 0.589158
+ 9 lambda 0.18685 None None None
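+
+ The ``regime_lag_sep`` argument documented above lets the spatial lag
+ parameter differ by regime as well. As a sketch only, and assuming separate
+ regressions per regime are acceptable (``regime_err_sep=True``), such a
+ call might look like this:
+
+ >>> model_lag_sep = GM_Combo_Regimes(y, x, regimes, yd, q, w=w, regime_lag_sep=True, regime_err_sep=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT')
+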
+ """
+
+
+
+
+
+class GMM_Error_Regimes(
+ GM_Error_Regimes,
+ GM_Combo_Regimes,
+ GM_Endog_Error_Regimes,
+ GM_Error_Het_Regimes,
+ GM_Combo_Het_Regimes,
+ GM_Endog_Error_Het_Regimes,
+ GM_Error_Hom_Regimes,
+ GM_Combo_Hom_Regimes,
+ GM_Endog_Error_Hom_Regimes,
+):
+"""
+ Wrapper function to call any of the GM methods for a spatial error regimes model available in spreg
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object (always needed)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable (if any)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (if any)
+ (note: this should not contain any variables from x)
+ estimator : string
+ Choice of estimator to be used. Options are: 'het', which
+ is robust to heteroskedasticity, 'hom', which assumes
+ homoskedasticity, and 'kp98', which does not provide
+ inference on the spatial parameter for the error term.
+ constant_regi: string, optional
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ Always False, kept for consistency, ignored.
+ add_wy : boolean
+ If True, then a spatial lag of the dependent variable is included.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error or GNSM type.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the maximum/minimum bounds.
+ **kwargs : keywords
+ Additional arguments to pass on to the estimators.
+ See the specific functions for details on what can be used.
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ vm : array
+ Variance covariance matrix (kxk)
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ name_yend : list of strings (optional)
+ Names of endogenous variables for use in output
+ name_z : list of strings (optional)
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings (optional)
+ Names of external instruments
+ name_h : list of strings (optional)
+ Names of all instruments used in output
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``libpysal`` to
+ handle the weights and file management.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...].
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial error model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``NAT.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ The GMM_Error_Regimes class can run error models and SARAR models, that is, models that combine a spatial lag and a spatial error term.
+ In this example we will run a simple version of the latter, where we have the
+ spatial effects as well as exogenous variables. Since it is a spatial
+ model, we have to pass in the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import GMM_Error_Regimes
+ >>> model = GMM_Error_Regimes(y, x, regimes, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them.
+
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 1.461317 0.848361 1.722517 0.084976
+ 1 0_PS90 0.958711 0.239834 3.997388 0.000064
+ 2 0_UE90 0.565825 0.063726 8.879088 0.0
+ 3 1_CONSTANT 9.115738 1.976874 4.611189 0.000004
+ 4 1_PS90 1.132419 0.334107 3.389387 0.0007
+ 5 1_UE90 0.651804 0.105518 6.177197 0.0
+ 6 _Global_W_HR90 -0.458677 0.180997 -2.534173 0.011271
+ 7 lambda 0.734354 0.035255 20.829823 0.0
+
+ This class also allows the user to run a spatial lag+error model with the
+ extra feature of including non-spatial endogenous regressors. This means
+ that, in addition to the spatial lag and error, we consider some of the
+ variables on the right-hand side of the equation as endogenous and we
+ instrument for this. In this case we consider RD90 (resource deprivation)
+ as an endogenous regressor. We use FP89 (families below poverty) as its
+ instrument and hence pass it in the instruments parameter, 'q'.
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ And then we can run and explore the model analogously to the previous combo:
+
+ >>> model = GMM_Error_Regimes(y, x, regimes, yend=yd, q=q, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 1.461317 0.848361 1.722517 0.084976
+ 1 0_PS90 0.958711 0.239834 3.997388 0.000064
+ 2 0_UE90 0.565825 0.063726 8.879088 0.0
+ 3 1_CONSTANT 9.115738 1.976874 4.611189 0.000004
+ 4 1_PS90 1.132419 0.334107 3.389387 0.0007
+ 5 1_UE90 0.651804 0.105518 6.177197 0.0
+ 6 _Global_W_HR90 -0.458677 0.180997 -2.534173 0.011271
+ 7 lambda 0.734354 0.035255 20.829823 0.0
+
+ The class also allows for estimating a GNS model by adding spatial lags of the exogenous variables, using the argument slx_lags:
+
+ >>> model = GMM_Error_Regimes(y, x, regimes, w=w, add_wy=True, slx_lags=1, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT')
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 0.192699 0.256922 0.75003 0.453237
+ 1 0_PS90 1.098019 0.232054 4.731743 0.000002
+ 2 0_UE90 0.606622 0.07762 7.815325 0.0
+ 3 0_W_PS90 -1.068778 0.203911 -5.241381 0.0
+ 4 0_W_UE90 -0.657932 0.176073 -3.73671 0.000186
+ 5 1_CONSTANT -0.104299 1.790953 -0.058237 0.95356
+ 6 1_PS90 1.219796 0.316425 3.854936 0.000116
+ 7 1_UE90 0.678922 0.120491 5.634647 0.0
+ 8 1_W_PS90 -1.308599 0.536231 -2.440366 0.014672
+ 9 1_W_UE90 -0.708492 0.167057 -4.24102 0.000022
+ 10 _Global_W_HR90 1.033956 0.269252 3.840111 0.000123
+ 11 lambda -0.384968 0.192256 -2.002366 0.045245
+
+
+ """
+
+ def __init__(
+ self,
+ y,
+ x,
+ regimes,
+ w,
+ yend=None,
+ q=None,
+ estimator="het",
+ constant_regi="many",
+ cols2regi="all",
+ regime_err_sep=False,
+ regime_lag_sep=False,
+ add_wy=False,
+ slx_lags=0,
+ vm=False,
+ name_y=None,
+ name_x=None,
+ name_w=None,
+ name_regimes=None,
+ name_yend=None,
+ name_q=None,
+ name_ds=None,
+ latex=False,
+ **kwargs,
+ ):
+ if estimator == "het":
+ if yend is None and not add_wy:
+ GM_Error_Het_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ elif yend is not None and not add_wy:
+ GM_Endog_Error_Het_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ yend=yend,
+ q=q,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_yend=name_yend,
+ name_q=name_q,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ elif add_wy:
+ GM_Combo_Het_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ yend=yend,
+ q=q,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ regime_lag_sep=regime_lag_sep,
+ name_yend=name_yend,
+ name_q=name_q,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ else:
+ set_warn(
+ self,
+ "Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.",
+ )
+ GM_Error_Het_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ )
+ elif estimator == "hom":
+ if yend is None and not add_wy:
+ GM_Error_Hom_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ elif yend is not None and not add_wy:
+ GM_Endog_Error_Hom_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ yend=yend,
+ q=q,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_yend=name_yend,
+ name_q=name_q,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ elif add_wy:
+ GM_Combo_Hom_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ yend=yend,
+ q=q,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ regime_lag_sep=regime_lag_sep,
+ name_yend=name_yend,
+ name_q=name_q,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ else:
+ set_warn(
+ self,
+ "Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.",
+ )
+ GM_Error_Hom_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ )
+ elif estimator == "kp98":
+ if yend is None and not add_wy:
+ GM_Error_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ elif yend is not None and not add_wy:
+ GM_Endog_Error_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ yend=yend,
+ q=q,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_yend=name_yend,
+ name_q=name_q,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ elif add_wy:
+ GM_Combo_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ yend=yend,
+ q=q,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ regime_lag_sep=regime_lag_sep,
+ name_yend=name_yend,
+ name_q=name_q,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ **kwargs,
+ )
+ else:
+ set_warn(
+ self,
+ "Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.",
+ )
+ GM_Error_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ )
+ else:
+ set_warn(
+ self,
+ "Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.",
+ )
+ GM_Error_Het_Regimes.__init__(
+ self,
+ y=y,
+ x=x,
+ regimes=regimes,
+ w=w,
+ slx_lags=slx_lags,
+ vm=vm,
+ name_y=name_y,
+ name_x=name_x,
+ constant_regi=constant_regi,
+ cols2regi=cols2regi,
+ regime_err_sep=regime_err_sep,
+ name_w=name_w,
+ name_regimes=name_regimes,
+ name_ds=name_ds,
+ latex=latex,
+ )
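+
+# A sketch, not part of the original module: the constructor above dispatches to
+# one of nine underlying estimator classes based on the estimator keyword
+# ("het", "hom" or "kp98") and on whether yend/q or add_wy=True are supplied;
+# any unsupported combination falls back to the default GM_Error_Het_Regimes
+# with a warning. A minimal, self-contained illustration reusing the NAT
+# example data from the docstring above (kept as comments so the module text
+# stays importable):
+#
+# import numpy as np
+# import libpysal
+# from libpysal.examples import load_example
+# from spreg import GMM_Error_Regimes
+#
+# nat = load_example('Natregimes')
+# db = libpysal.io.open(nat.get_path("natregimes.dbf"), 'r')
+# y = np.array([db.by_col('HR90')]).reshape(3085, 1)
+# x = np.array([db.by_col(name) for name in ['PS90', 'UE90']]).T
+# regimes = db.by_col('SOUTH')
+# w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp"))
+# w.transform = 'r'
+#
+# het = GMM_Error_Regimes(y, x, regimes, w=w)                    # -> GM_Error_Het_Regimes
+# hom = GMM_Error_Regimes(y, x, regimes, w=w, estimator="hom")   # -> GM_Error_Hom_Regimes
+# kp98 = GMM_Error_Regimes(y, x, regimes, w=w, estimator="kp98") # -> GM_Error_Regimes
+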
+
+
+def _work_error(y, x, regi_ids, r, w, name_ds, name_y, name_x, name_w, name_regimes):
+    w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True)
+    y_r = y[regi_ids[r]]
+    x_r = x[regi_ids[r]]
+    model = BaseGM_Error(y_r, x_r, w_r.sparse)
+    set_warn(model, warn)
+    model.w = w_r
+    model.title = "GM SPATIALLY WEIGHTED LEAST SQUARES ESTIMATION - REGIME %s" % r
+    model.name_ds = name_ds
+    model.name_y = "%s_%s" % (str(r), name_y)
+    model.name_x = ["%s_%s" % (str(r), i) for i in name_x]
+    model.name_w = name_w
+    model.name_regimes = name_regimes
+    return model
+
+
+def _work_endog_error(
+    y,
+    x,
+    yend,
+    q,
+    regi_ids,
+    r,
+    w,
+    name_ds,
+    name_y,
+    name_x,
+    name_yend,
+    name_q,
+    name_w,
+    name_regimes,
+    add_lag,
+    slx_lags,
+):
+    w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True)
+    y_r = y[regi_ids[r]]
+    x_r = x[regi_ids[r]]
+    if yend is not None:
+        yend_r = yend[regi_ids[r]]
+        q_r = q[regi_ids[r]]
+    else:
+        yend_r, q_r = None, None
+    if add_lag != False:
+        yend_r, q_r = set_endog(
+            y_r, x_r[:, 1:], w_r, yend_r, q_r, add_lag[0], add_lag[1]
+        )
+    model = BaseGM_Endog_Error(y_r, x_r, yend_r, q_r, w_r.sparse)
+    set_warn(model, warn)
+    if add_lag != False:
+        model.rho = model.betas[-2]
+        model.predy_e, model.e_pred, warn = sp_att(
+            w_r, model.y, model.predy, model.yend[:, -1].reshape(model.n, 1), model.rho
+        )
+        set_warn(model, warn)
+    model.w = w_r
+    if slx_lags == 0:
+        if add_lag != False:
+            model.title = "SPATIALLY WEIGHTED 2SLS - GM-COMBO MODEL - REGIME %s" % r
+        else:
+            model.title = "SPATIALLY WEIGHTED 2SLS (GM) - REGIME %s" % r
+    else:
+        if add_lag != False:
+            model.title = "GM SPATIAL COMBO MODEL + SLX (GNSM) - REGIME %s" % r
+        else:
+            model.title = "GM SPATIALLY WEIGHTED 2SLS + SLX (SLX-Error) - REGIME %s" % r
+    model.name_ds = name_ds
+    model.name_y = "%s_%s" % (str(r), name_y)
+    model.name_x = ["%s_%s" % (str(r), i) for i in name_x]
+    model.name_yend = ["%s_%s" % (str(r), i) for i in name_yend]
+    model.name_z = model.name_x + model.name_yend + [str(r) + "_lambda"]
+    model.name_q = ["%s_%s" % (str(r), i) for i in name_q]
+    model.name_h = model.name_x + model.name_q
+    model.name_w = name_w
+    model.name_regimes = name_regimes
+    return model
+
+
+def _test():
+    import doctest
+
+    start_suppress = np.get_printoptions()["suppress"]
+    np.set_printoptions(suppress=True)
+    doctest.testmod()
+    np.set_printoptions(suppress=start_suppress)
+
+
+if __name__ == "__main__":
+    _test()
+    import numpy as np
+    import libpysal
+
+    db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+    y = np.array(db.by_col("HOVAL"))
+    y = np.reshape(y, (49, 1))
+    X = []
+    X.append(db.by_col("INC"))
+    X = np.array(X).T
+    yd = []
+    yd.append(db.by_col("CRIME"))
+    yd = np.array(yd).T
+    q = []
+    q.append(db.by_col("DISCBD"))
+    q = np.array(q).T
+
+    r_var = "NSA"
+    regimes = db.by_col(r_var)
+
+    w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+    w.transform = "r"
+    # reg = GM_Error_Regimes(y, X, regimes, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus',
+    #                        regime_err_sep=True)
+    # reg = GM_Endog_Error_Regimes(y, X, yd, q, regimes, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'],
+    #                              name_q=['discbd'], name_ds='columbus', regime_err_sep=True)
+    reg = GM_Combo_Regimes(
+        y,
+        X,
+        regimes,
+        yd,
+        q,
+        w=w,
+        name_x=["inc"],
+        name_y="hoval",
+        name_yend=["crime"],
+        name_q=["discbd"],
+        name_ds="columbus",
+        regime_err_sep=True,
+        regime_lag_sep=True,
+    )
+    print(reg.output)
+    print(reg.summary)
+
+"""
+ML Estimation of Spatial Error Model
+"""
+
+__author__="Luc Anselin luc.anselin@asu.edu,\
+ Serge Rey srey@asu.edu, \
+ Levi Wolf levi.john.wolf@asu.edu"
+
+import numpy as np
+import numpy.linalg as la
+from scipy import sparse as sp
+from scipy.sparse.linalg import splu as SuperLU
+from .utils import RegressionPropsY, RegressionPropsVM, set_warn, get_lags
+from . import diagnostics as DIAG
+from . import user_output as USER
+from . import regimes as REGI
+from .w_utils import symmetrize
+import pandas as pd
+from .output import output, _nonspat_top
+
+try:
+    from scipy.optimize import minimize_scalar
+
+    minimize_scalar_available = True
+except ImportError:
+    minimize_scalar_available = False
+from .sputils import spdot, spfill_diagonal, spinv
+from libpysal import weights
+
+__all__ = ["ML_Error"]
+
+
+class BaseML_Error(RegressionPropsY, RegressionPropsVM, REGI.Regimes_Frame):
+"""
+ ML estimation of the spatial error model (note no consistency
+ checks, diagnostics or constants added): :cite:`Anselin1988`
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ method : string
+ if 'full', brute force calculation (full matrix expressions)
+ if 'ord', Ord eigenvalue calculation
+ if 'LU', LU decomposition for sparse matrices
+ epsilon : float
+ tolerance criterion in minimize_scalar function and inverse_product
+ regimes_att : dictionary
+ Dictionary containing elements to be used in case of a regimes model,
+ i.e. 'x' before regimes, 'regimes' list and 'cols2regi'
+
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ lam : float
+ estimate of spatial autoregressive coefficient
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant, excluding the rho)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ method : string
+ log Jacobian method
+ if 'full': brute force (full matrix computations)
+ if 'ord' : Ord eigenvalue method
+ epsilon : float
+ tolerance criterion used in minimize_scalar function and inverse_product
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1) - includes lambda
+ vm1 : array
+ 2x2 array of variance covariance for lambda, sigma
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> from libpysal.examples import load_example
+ >>> import spreg
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> south = load_example('South')
+ >>> db = libpysal.io.open(south.get_path("south.dbf"),'r')
+ >>> y_name = "HR90"
+ >>> y = np.array(db.by_col(y_name))
+ >>> y.shape = (len(y),1)
+ >>> x_names = ["RD90","PS90","UE90","DV90"]
+ >>> x = np.array([db.by_col(var) for var in x_names]).T
+ >>> x = np.hstack((np.ones((len(y),1)),x))
+ >>> w = libpysal.weights.Queen.from_shapefile(south.get_path("south.shp"))
+ >>> w.transform = 'r'
+ >>> mlerr = spreg.ml.error.BaseML_Error(y,x,w) #doctest: +SKIP
+ >>> "{0:.6f}".format(mlerr.lam) #doctest: +SKIP
+ '0.299078'
+ >>> np.around(mlerr.betas, decimals=4) #doctest: +SKIP
+ array([[ 6.1492],
+ [ 4.4024],
+ [ 1.7784],
+ [-0.3781],
+ [ 0.4858],
+ [ 0.2991]])
+ >>> "{0:.6f}".format(mlerr.mean_y) #doctest: +SKIP
+ '9.549293'
+ >>> "{0:.6f}".format(mlerr.std_y) #doctest: +SKIP
+ '7.038851'
+ >>> np.diag(mlerr.vm) #doctest: +SKIP
+ array([ 1.06476526, 0.05548248, 0.04544514, 0.00614425, 0.01481356,
+ 0.00143001])
+ >>> "{0:.6f}".format(mlerr.sig2[0][0]) #doctest: +SKIP
+ '32.406854'
+ >>> "{0:.6f}".format(mlerr.logll) #doctest: +SKIP
+ '-4471.407067'
+ >>> mlerr1 = BaseML_Error(y,x,w,method='ord') #doctest: +SKIP
+ >>> "{0:.6f}".format(mlerr1.lam) #doctest: +SKIP
+ '0.299078'
+ >>> np.around(mlerr1.betas, decimals=4) #doctest: +SKIP
+ array([[ 6.1492],
+ [ 4.4024],
+ [ 1.7784],
+ [-0.3781],
+ [ 0.4858],
+ [ 0.2991]])
+ >>> "{0:.6f}".format(mlerr1.mean_y) #doctest: +SKIP
+ '9.549293'
+ >>> "{0:.6f}".format(mlerr1.std_y) #doctest: +SKIP
+ '7.038851'
+ >>> np.around(np.diag(mlerr1.vm), decimals=4) #doctest: +SKIP
+ array([ 1.0648, 0.0555, 0.0454, 0.0061, 0.0148, 0.0014])
+ >>> "{0:.4f}".format(mlerr1.sig2[0][0]) #doctest: +SKIP
+ '32.4069'
+ >>> "{0:.4f}".format(mlerr1.logll) #doctest: +SKIP
+ '-4471.4071'
+
+ """
+
+ def __init__(self, y, x, w, method="full", epsilon=0.0000001, regimes_att=None):
+ # set up main regression variables and spatial filters
+ self.y=y
+ if regimes_att:
+ self.x=x.toarray()
+ else:
+ self.x=x
+ self.n,self.k=self.x.shape
+ self.method=method
+ self.epsilon=epsilon
+
+ # W = w.full()[0] #wait to build pending what is needed
+ # Wsp = w.sparse
+
+ ylag=weights.lag_spatial(w,self.y)
+ xlag=self.get_x_lag(w,regimes_att)
+
+ # call minimizer using concentrated log-likelihood to get lambda
+ methodML=method.upper()
+ if methodML in ["FULL", "LU", "ORD"]:
+ if methodML == "FULL":
+ W=w.full()[0]# need dense here
+ res=minimize_scalar(
+ err_c_loglik,
+ 0.0,
+ bounds=(-1.0,1.0),
+ args=(self.n,self.y,ylag,self.x,xlag,W),
+ method="bounded",
+ tol=epsilon,
+ )
+ elif methodML == "LU":
+ I=sp.identity(w.n)
+ Wsp=w.sparse# need sparse here
+ res=minimize_scalar(
+ err_c_loglik_sp,
+ 0.0,
+ bounds=(-1.0,1.0),
+ args=(self.n,self.y,ylag,self.x,xlag,I,Wsp),
+ method="bounded",
+ tol=epsilon,
+ )
+ W=Wsp
+ elif methodML == "ORD":
+ # check on symmetry structure
+ if w.asymmetry(intrinsic=False) == []:
+ ww=symmetrize(w)
+ WW=np.array(ww.todense())
+ evals=la.eigvalsh(WW)
+ W=WW
+ else:
+ W=w.full()[0]# need dense here
+ evals=la.eigvals(W)
+ res=minimize_scalar(
+ err_c_loglik_ord,
+ 0.0,
+ bounds=(-1.0,1.0),
+ args=(self.n,self.y,ylag,self.x,xlag,evals),
+ method="bounded",
+ tol=epsilon,
+ )
+ else:
+ raise Exception("{0} is an unsupported method".format(method))
+
+ self.lam=res.x
+
+ # compute full log-likelihood, including constants
+ ln2pi=np.log(2.0*np.pi)
+ llik=-res.fun-self.n/2.0*ln2pi-self.n/2.0
+
+ self.logll=llik
+
+ # b, residuals and predicted values
+
+ ys=self.y-self.lam*ylag
+ xs=self.x-self.lam*xlag
+ xsxs=np.dot(xs.T,xs)
+ xsxsi=np.linalg.inv(xsxs)
+ xsys=np.dot(xs.T,ys)
+ b=np.dot(xsxsi,xsys)
+
+ self.betas=np.vstack((b,self.lam))
+
+ self.u=y-np.dot(self.x,b)
+ self.predy=self.y-self.u
+
+ # residual variance
+
+ self.e_filtered=self.u-self.lam*weights.lag_spatial(w,self.u)
+ self.sig2=np.dot(self.e_filtered.T,self.e_filtered)/self.n
+
+ # variance-covariance matrix betas
+
+ varb=self.sig2*xsxsi
+
+ # variance-covariance matrix lambda, sigma
+
+ a=-self.lam*W
+ spfill_diagonal(a,1.0)
+ ai=spinv(a)
+ wai=spdot(W,ai)
+ tr1=wai.diagonal().sum()
+
+ wai2=spdot(wai,wai)
+ tr2=wai2.diagonal().sum()
+
+ waiTwai=spdot(wai.T,wai)
+ tr3=waiTwai.diagonal().sum()
+
+ v1=np.vstack((tr2+tr3,tr1/self.sig2))
+ v2=np.vstack((tr1/self.sig2,self.n/(2.0*self.sig2**2)))
+
+ v=np.hstack((v1,v2))
+
+ self.vm1=np.linalg.inv(v)
+
+ # create variance matrix for beta, lambda
+ vv=np.hstack((varb,np.zeros((self.k,1))))
+ vv1=np.hstack((np.zeros((1,self.k)),self.vm1[0,0]*np.ones((1,1))))
+
+ self.vm=np.vstack((vv,vv1))
+
+ def get_x_lag(self, w, regimes_att):
+ if regimes_att:
+ xlag=weights.lag_spatial(w,regimes_att["x"])
+ xlag=REGI.Regimes_Frame.__init__(
+ self,
+ xlag,
+ regimes_att["regimes"],
+ constant_regi=None,
+ cols2regi=regimes_att["cols2regi"],
+ )[0]
+ xlag=xlag.toarray()
+ else:
+ xlag=weights.lag_spatial(w,self.x)
+ return xlag
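+
+# For reference, the concentrated log-likelihood minimized over lambda in the
+# "full" branch above has the standard spatial error form: y and X are
+# spatially filtered with the candidate lambda, beta is concentrated out by
+# OLS, and the criterion combines the residual variance term with the
+# log-Jacobian ln|I - lambda*W|. The function below is an illustrative sketch
+# only; the actual err_c_loglik used by spreg may differ in detail.
+def _err_c_loglik_sketch(lam, n, y, ylag, x, xlag, W):
+    import numpy as np
+    # spatially filter y and X with the candidate lambda
+    ys = y - lam * ylag
+    xs = x - lam * xlag
+    # concentrate beta out via OLS on the filtered variables
+    b = np.linalg.solve(xs.T @ xs, xs.T @ ys)
+    e = ys - xs @ b
+    sig2 = (e.T @ e).item() / n
+    # log-Jacobian ln|I - lambda*W| (dense computation, as in the 'full' method)
+    jacob = np.log(np.linalg.det(np.identity(n) - lam * W))
+    # negative concentrated log-likelihood (up to constants), minimized over lambda
+    return (n / 2.0) * np.log(sig2) - jacob
+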
+
+
+
+[docs]
+class ML_Error(BaseML_Error):
+"""
+ ML estimation of the spatial error model with all results and diagnostics;
+ :cite:`Anselin1988`
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ method : string
+ if 'full', brute force calculation (full matrix expressions)
+ if 'ord', Ord eigenvalue method
+ if 'LU', LU sparse matrix decomposition
+ epsilon : float
+ tolerance criterion in minimize_scalar function and inverse_product
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ betas : array
+ (k+1)x1 array of estimated coefficients (rho first)
+ lam : float
+ estimate of spatial autoregressive coefficient
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant, excluding lambda)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ method : string
+ log Jacobian method
+ if 'full': brute force (full matrix computations)
+ epsilon : float
+ tolerance criterion used in minimize_scalar function and inverse_product
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ varb : array
+ Variance covariance matrix (k+1 x k+1) - includes var(lambda)
+ vm1 : array
+ variance covariance matrix for lambda, sigma (2 x 2)
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ utu : float
+ Sum of squared residuals
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+ >>> from libpysal.weights import Queen
+ >>> from spreg import ML_Error
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> south = load_example('South')
+ >>> db = libpysal.io.open(south.get_path("south.dbf"),'r')
+ >>> y_name = "HR90"
+ >>> y = np.array(db.by_col(y_name))
+ >>> y.shape = (len(y),1)
+ >>> x_names = ["RD90","PS90","UE90","DV90"]
+ >>> x = np.array([db.by_col(var) for var in x_names]).T
+ >>> w = Queen.from_shapefile(south.get_path("south.shp"))
+ >>> w_name = "south_q.gal"
+ >>> w.transform = 'r'
+ >>> ds_name = "south.dbf"
+ >>> mlerr = ML_Error(y,x,w,name_y=y_name,name_x=x_names,\
+ name_w=w_name,name_ds=ds_name) #doctest: +SKIP
+ >>> np.around(mlerr.betas, decimals=4) #doctest: +SKIP
+ array([[ 6.1492],
+ [ 4.4024],
+ [ 1.7784],
+ [-0.3781],
+ [ 0.4858],
+ [ 0.2991]])
+ >>> "{0:.4f}".format(mlerr.lam) #doctest: +SKIP
+ '0.2991'
+ >>> "{0:.4f}".format(mlerr.mean_y) #doctest: +SKIP
+ '9.5493'
+ >>> "{0:.4f}".format(mlerr.std_y) #doctest: +SKIP
+ '7.0389'
+ >>> np.around(np.diag(mlerr.vm), decimals=4) #doctest: +SKIP
+ array([ 1.0648, 0.0555, 0.0454, 0.0061, 0.0148, 0.0014])
+ >>> np.around(mlerr.sig2, decimals=4) #doctest: +SKIP
+ array([[ 32.4069]])
+ >>> "{0:.4f}".format(mlerr.logll) #doctest: +SKIP
+ '-4471.4071'
+ >>> "{0:.4f}".format(mlerr.aic) #doctest: +SKIP
+ '8952.8141'
+ >>> "{0:.4f}".format(mlerr.schwarz) #doctest: +SKIP
+ '8979.0779'
+ >>> "{0:.4f}".format(mlerr.pr2) #doctest: +SKIP
+ '0.3058'
+ >>> "{0:.4f}".format(mlerr.utu) #doctest: +SKIP
+ '48534.9148'
+ >>> np.around(mlerr.std_err, decimals=4) #doctest: +SKIP
+ array([ 1.0319, 0.2355, 0.2132, 0.0784, 0.1217, 0.0378])
+ >>> np.around(mlerr.z_stat, decimals=4) #doctest: +SKIP
+ array([[ 5.9593, 0. ],
+ [ 18.6902, 0. ],
+ [ 8.3422, 0. ],
+ [ -4.8233, 0. ],
+ [ 3.9913, 0.0001],
+ [ 7.9089, 0. ]])
+ >>> mlerr.name_y #doctest: +SKIP
+ 'HR90'
+ >>> mlerr.name_x #doctest: +SKIP
+ ['CONSTANT', 'RD90', 'PS90', 'UE90', 'DV90', 'lambda']
+ >>> mlerr.name_w #doctest: +SKIP
+ 'south_q.gal'
+ >>> mlerr.name_ds #doctest: +SKIP
+ 'south.dbf'
+ >>> mlerr.title #doctest: +SKIP
+ 'MAXIMUM LIKELIHOOD SPATIAL ERROR (METHOD = FULL)'
+
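+ The slx_lags argument turns the specification into an SLX-Error model by
+ adding spatial lags of the X variables. A minimal sketch, reusing the
+ objects defined above (results not shown):
+
+ >>> mlerr_slx = ML_Error(y, x, w, slx_lags=1, name_y=y_name, name_x=x_names, name_w=w_name, name_ds=ds_name) #doctest: +SKIP
+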
+
+ """
+
+
+"""
+ML Estimation of Spatial Error Model
+"""
+
+__author__="Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu"
+
+import libpysal
+import numpy as np
+import multiprocessing as mp
+from . import regimes as REGI
+from . import user_output as USER
+from . import diagnostics as DIAG
+from .utils import set_warn, get_lags
+from .sputils import sphstack
+from .ml_error import BaseML_Error
+from platform import system
+import pandas as pd
+from .output import output, _nonspat_top
+
+__all__=["ML_Error_Regimes"]
+
+
+
+[docs]
+class ML_Error_Regimes(BaseML_Error, REGI.Regimes_Frame):
+"""
+ ML estimation of the spatial error model with regimes (note no consistency
+ checks, diagnostics or constants added); :cite:`Anselin1988`
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX-Error type.
+ method : string
+ if 'full', brute force calculation (full matrix expressions)
+ if 'ord', Ord eigenvalue computation
+ if 'LU', LU sparse matrix decomposition
+ epsilon : float
+ tolerance criterion in minimize_scalar function and inverse_product
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ regime_lag_sep: boolean
+ Always False, kept for consistency in function call, ignored.
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ latex : boolean
+ Specifies if the table with the coefficients' results and their inference is to be printed in LaTeX format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ (k+1)x1 array of estimated coefficients (lambda last)
+ lam : float
+ estimate of spatial autoregressive coefficient
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant, excluding the rho)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ method : string
+ log Jacobian method.
+ if 'full': brute force (full matrix computations)
+ if 'ord', Ord eigenvalue computation
+ if 'LU', LU sparse matrix decomposition
+ epsilon : float
+ tolerance criterion used in minimize_scalar function and inverse_product
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1), all coefficients
+ vm1 : array
+ variance covariance matrix for lambda, sigma (2 x 2)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ logll : float
+ maximized log-likelihood (including constant terms)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ Open data baltim.dbf using pysal and create the variables matrices and weights matrix.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+ >>> from libpysal.weights import Queen
+ >>> from spreg import ML_Error_Regimes
+ >>> import geopandas as gpd
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> baltimore = load_example('Baltimore')
+ >>> db = libpysal.io.open(baltimore.get_path("baltim.dbf"),'r')
+ >>> df = gpd.read_file(baltimore.get_path("baltim.shp"))
+ >>> ds_name = "baltim.dbf"
+ >>> y_name = "PRICE"
+ >>> y = np.array(db.by_col(y_name)).T
+ >>> y.shape = (len(y),1)
+ >>> x_names = ["NROOM","AGE","SQFT"]
+ >>> x = np.array([db.by_col(var) for var in x_names]).T
+ >>> w = Queen.from_dataframe(df)
+ >>> w_name = "baltim_q.gal"
+ >>> w.transform = 'r'
+
+ Since in this example we are interested in checking whether the results vary
+ by regimes, we use CITCOU to define whether the location is in the city or
+ outside the city (in the county):
+
+ >>> regimes = db.by_col("CITCOU")
+
+ Now we can run the regression with all parameters:
+
+ >>> mlerr = ML_Error_Regimes(y,x,regimes,w=w,name_y=y_name,name_x=x_names,\
+ name_w=w_name,name_ds=ds_name,name_regimes="CITCOU")
+ >>> np.around(mlerr.betas, decimals=4)
+ array([[-2.076 ],
+ [ 4.8615],
+ [-0.0295],
+ [ 0.3355],
+ [32.3457],
+ [ 2.8708],
+ [-0.2401],
+ [ 0.799 ],
+ [ 0.6 ]])
+ >>> "{0:.6f}".format(mlerr.lam)
+ '0.599951'
+ >>> "{0:.6f}".format(mlerr.mean_y)
+ '44.307180'
+ >>> "{0:.6f}".format(mlerr.std_y)
+ '23.606077'
+ >>> np.around(mlerr.vm1, decimals=4)
+ array([[ 0.0053, -0.3643],
+ [ -0.3643, 465.3559]])
+ >>> np.around(np.diag(mlerr.vm), decimals=4)
+ array([58.7121, 2.5036, 0.0074, 0.0659, 81.9796, 3.2676, 0.0124,
+ 0.0514, 0.0053])
+ >>> np.around(mlerr.sig2, decimals=4)
+ array([[215.554]])
+ >>> "{0:.6f}".format(mlerr.logll)
+ '-872.860883'
+ >>> "{0:.6f}".format(mlerr.aic)
+ '1761.721765'
+ >>> "{0:.6f}".format(mlerr.schwarz)
+ '1788.536630'
+ >>> mlerr.title
+ 'MAXIMUM LIKELIHOOD SPATIAL ERROR - REGIMES (METHOD = full)'
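+
+ When regime_err_sep=True and all the variables vary by regime, a separate
+ error model is estimated for each regime and the individual results are
+ collected in the 'multi' dictionary described above. A minimal sketch,
+ reusing the objects defined above (results not shown):
+
+ >>> mlerr_sep = ML_Error_Regimes(y, x, regimes, w=w, regime_err_sep=True, name_y=y_name, name_x=x_names, name_w=w_name, name_ds=ds_name, name_regimes="CITCOU") #doctest: +SKIP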
+ """
+
+
+"""
+ML Estimation of Spatial Lag Model with Regimes
+"""
+
+__author__="Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu"
+
+import numpy as np
+from . import regimes as REGI
+from . import user_output as USER
+from . import diagnostics as DIAG
+import multiprocessing as mp
+from .ml_lag import BaseML_Lag
+from .utils import set_warn, get_lags
+import pandas as pd
+from .output import output, _nonspat_top, _spat_diag_out, _spat_pseudo_r2, _summary_impacts
+
+
+__all__=["ML_Lag_Regimes"]
+
+
+
+[docs]
+class ML_Lag_Regimes(BaseML_Lag, REGI.Regimes_Frame):
+
+"""
+ ML estimation of the spatial lag model with regimes (note no consistency
+ checks, diagnostics or constants added) :cite:`Anselin1988`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default)
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ w : Sparse matrix
+ Spatial weights sparse matrix
+ method : string
+ if 'full', brute force calculation (full matrix expressions)
+ if 'ord', Ord eigenvalue method
+ if 'LU', LU sparse matrix decomposition
+ epsilon : float
+ tolerance criterion in minimize_scalar function and inverse_product
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the Spatial Durbin type.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ spat_diag : boolean
+ If True, then compute Common Factor Hypothesis test when applicable
+ spat_impacts : string or list
+ Include average direct impact (ADI), average indirect impact (AII),
+ and average total impact (ATI) in summary results.
+ Options are 'simple', 'full', 'power', 'all' or None.
+ See sputils.spmultiplier for more information.
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ (k+1)x1 array of estimated coefficients (rho first)
+ rho : float
+ estimate of spatial autoregressive coefficient
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant, excluding the rho)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ method : string
+ log Jacobian method.
+ if 'full': brute force (full matrix computations)
+ if 'ord', Ord eigenvalue method
+ if 'LU', LU sparse matrix decomposition
+ cfh_test : tuple
+ Common Factor Hypothesis test; tuple contains the pair (statistic,
+ p-value). Only when it applies (see specific documentation).
+ epsilon : float
+ tolerance criterion used in minimize_scalar function and inverse_product
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1), all coefficients
+ vm1 : array
+ Variance covariance matrix (k+2 x k+2), includes sig2
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ logll : float
+ maximized log-likelihood (including constant terms)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ aic : float
+ Akaike information criterion
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ schwarz : float
+ Schwarz criterion
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ predy_e : array
+ predicted values from reduced form
+ e_pred : array
+ prediction errors using reduced form predicted values
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sp_multipliers: dict
+ Dictionary of spatial multipliers (if spat_impacts is not None)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: ['one', 'many']
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ regime_err_sep: boolean
+ always set to False - kept for compatibility with other
+ regime models
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ Open data baltim.dbf using pysal and create the variables matrices and weights matrix.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+ >>> from libpysal.examples import load_example
+ >>> from libpysal.weights import Queen
+ >>> from spreg import ML_Lag_Regimes
+ >>> import geopandas as gpd
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> baltimore = load_example('Baltimore')
+ >>> db = libpysal.io.open(baltimore.get_path("baltim.dbf"),'r')
+ >>> df = gpd.read_file(baltimore.get_path("baltim.shp"))
+ >>> ds_name = "baltim.dbf"
+ >>> y_name = "PRICE"
+ >>> y = np.array(db.by_col(y_name)).T
+ >>> y.shape = (len(y),1)
+ >>> x_names = ["NROOM","AGE","SQFT"]
+ >>> x = np.array([db.by_col(var) for var in x_names]).T
+ >>> w = Queen.from_dataframe(df)
+ >>> w_name = "baltim_q.gal"
+ >>> w.transform = 'r'
+
+ Since in this example we are interested in checking whether the results vary
+ by regimes, we use CITCOU to define whether the location is in the city or
+ outside the city (in the county):
+
+ >>> regimes = db.by_col("CITCOU")
+
+ Now we can run the regression with all parameters:
+
+ >>> mllag = ML_Lag_Regimes(y,x,regimes,w=w,name_y=y_name,name_x=x_names,\
+ name_w=w_name,name_ds=ds_name,name_regimes="CITCOU")
+ >>> np.around(mllag.betas, decimals=4)
+ array([[-14.5158],
+ [ 4.4923],
+ [ -0.0336],
+ [ 0.3541],
+ [ -3.601 ],
+ [ 3.8736],
+ [ -0.1747],
+ [ 0.8238],
+ [ 0.525 ]])
+ >>> "{0:.6f}".format(mllag.rho)
+ '0.524971'
+ >>> "{0:.6f}".format(mllag.mean_y)
+ '44.307180'
+ >>> "{0:.6f}".format(mllag.std_y)
+ '23.606077'
+ >>> np.around(np.diag(mllag.vm1), decimals=4)
+ array([ 48.6818, 2.4524, 0.0052, 0.0663, 71.4439, 3.2837,
+ 0.0118, 0.0498, 0.0042, 409.1225])
+ >>> np.around(np.diag(mllag.vm), decimals=4)
+ array([48.6818, 2.4524, 0.0052, 0.0663, 71.4439, 3.2837, 0.0118,
+ 0.0498, 0.0042])
+ >>> "{0:.6f}".format(mllag.sig2)
+ '204.827093'
+ >>> "{0:.6f}".format(mllag.logll)
+ '-867.086467'
+ >>> "{0:.6f}".format(mllag.aic)
+ '1752.172934'
+ >>> "{0:.6f}".format(mllag.schwarz)
+ '1782.339657'
+ >>> mllag.title
+ 'MAXIMUM LIKELIHOOD SPATIAL LAG - REGIMES (METHOD = full)'
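+
+ The spat_impacts argument adds average direct, indirect and total impacts
+ to the summary output, and the slx_lags argument turns the specification
+ into a Spatial Durbin model. A minimal sketch, reusing the objects defined
+ above (results not shown):
+
+ >>> mllag_imp = ML_Lag_Regimes(y, x, regimes, w=w, spat_impacts='simple', slx_lags=1, name_y=y_name, name_x=x_names, name_w=w_name, name_ds=ds_name, name_regimes="CITCOU") #doctest: +SKIP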
+ """
+
+
+"""
+Estimation of Nonlinear SLX Model
+"""
+
+__author__="Luc Anselin lanselin@gmail.com, \
+ Pedro V. Amaral pedrovma@gmail.com"
+
+import numpy as np
+import pandas as pd
+from scipy.sparse import coo_array, csr_array
+from scipy.optimize import minimize
+from libpysal.cg import KDTree
+from . import user_output as USER
+import numpy.linalg as la
+from .utils import make_wnslx, RegressionPropsY, set_warn
+from .output import output, _nslx_out
+from .diagnostics import log_likelihood, akaike, schwarz
+from itertools import compress
+
+__all__=["NSLX"]
+
+
+class BaseNSLX(RegressionPropsY):
+'''
+ Estimation of the nonlinear SLX model (note no consistency
+ checks, diagnostics or constants added) - inverse distance
+ power function and negative exponential distance function supported
+
+ Parameters
+ ----------
+ y : n by 1 numpy array with dependent variable
+ x : n by k array with explanatory variables, includes constant
+ xw : n by h array with selected columns of X that will have the
+ W(alpha) transformation applied to them
+ w : list of sparse CSR arrays to use in lag transformation, if
+ list has a single element, same weights applied for all
+ transform : tuple of transformations, either "exponential" or "power"
+ when same transformation applies to all, tuple is a single
+ element tuple (default is "power")
+ var_flag : flag for analytical computation of variance-covariance matrix
+ passed from NSLX
+ verbose : option for nonlinear optimization, either False (default) or
+ True
+ options : options specific to scipy minimize, such as {"disp":True}
+ (see scipy minimize docs)
+
+ Attributes
+ ----------
+ y : n by 1 numpy array with dependent variable
+ x : n by k array with explanatory variables, includes constant
+ xw : n by h array with selected columns of X that will have the
+ W(alpha) transformation applied to them
+ w : list of sparse CSR arrays to use in lag transformation, if
+ list has a single element, same weights applied for all
+ n : number of observations
+ k : number of explanatory variables in X (includes constant)
+ transform : tuple of transformations, either "power" or "exponential"
+ when same transformation applies to all, tuple is a single
+ element tuple (default is "power")
+ verbose : option for nonlinear optimization, either False (default) or
+ True
+ options : options specific to scipy minimize, such as {"disp":True}
+ (see scipy minimize docs)
+ betas : numpy array with parameter estimates
+ utu : sum of squared residuals
+ ihess : inverse of Hessian matrix
+ sign : estimate of residual variance (divided by n)
+ sig2 : same as sign
+ vm : coefficient variance-covariance matrix (sign x ihess)
+ predy : vector of predicted values
+ u : vector of residuals
+
+ '''
+
+ def __init__(self, y, x, xw, w, transform, var_flag, verbose, options):
+ self.y=y
+ self.x=x
+ self.xw=xw
+ self.n=self.x.shape[0]
+ h=self.xw.shape[1]
+ kk=self.x.shape[1]
+ self.k=kk+h
+
+ self.w=w
+ self.transform=transform
+ self.verbose=verbose
+ self.options=options
+
+ h=self.xw.shape[1]
+
+ xty=self.x.T@self.y
+ xtx=self.x.T@self.x
+ b0=la.solve(xtx,xty)
+
+ alpha0=np.ones((h,1))# initial value
+ g0=np.vstack((b0,alpha0))
+ gamma0=g0.flatten()
+ gradflag=0
+
+ ssmin=minimize(nslxobj,gamma0,
+ args=(self.y,self.x,self.xw,self.w,self.transform,self.verbose),
+ options=self.options)
+
+ self.betas=ssmin.x
+ self.utu=ssmin.fun
+ self.sign=self.utu/self.n
+ self.sig2=self.sign
+
+ # convergence criteria
+ self.success=ssmin.success
+ self.status=ssmin.status
+ self.message=ssmin.message
+ self.nit=ssmin.nit
+
+ # compute predy
+ b=self.betas[0:-h]
+ alpha=self.betas[-h:]
+
+ wx=nlmod(alpha,xw,w,transform,gradflag=0)
+ wxs=np.sum(wx,axis=1).reshape(-1,1)
+ xb=x@b.reshape(-1,1)
+ predy=xb+wxs
+ self.predy=predy
+ self.u=self.y-self.predy
+
+ # compute variance from gradients
+ if var_flag:
+ vb=np.zeros((self.k,self.k))
+ xtxi=la.inv(xtx)
+ vb[:kk,:kk]=xtxi
+ wax=nlmod(alpha,xw,w,transform,gradflag=1)
+ waxtwax=wax.T@wax
+ try:
+ waxtwaxi=la.inv(waxtwax)
+ except:
+ raise Exception("Singular variance matrix for nonlinear part")
+ vb[-h:,-h:]=waxtwaxi
+ self.vm=vb*self.sig2
+ elif var_flag == 0:
+ self.ihess=ssmin.hess_inv
+ self.vm=self.ihess*self.sig2
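+
+# The two distance transformations named in the docstring (negative exponential
+# and inverse distance power) correspond, in generic textbook form, to the
+# decay functions sketched below. This is an illustrative sketch only; the
+# exact rescaling applied by make_wnslx and the nonlinear transformation in
+# nlmod may differ (e.g., distances standardized by a bandwidth first).
+import numpy as np
+
+def _negative_exponential(d, alpha):
+    # weights decay as exp(-alpha * d)
+    return np.exp(-alpha * d)
+
+def _inverse_distance_power(d, alpha):
+    # weights decay as d ** (-alpha); zero weight assigned at zero distance
+    d = np.asarray(d, dtype=float)
+    out = np.zeros_like(d)
+    np.power(d, -alpha, out=out, where=d > 0)
+    return out
+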
+
+
+
+
+
+
+
+[docs]
+class NSLX(BaseNSLX):
+
+'''
+ Estimation of the nonlinear SLX model - inverse distance
+ power function and negative exponential distance function supported
+ Includes output of all results.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ coords : an n by 2 array or a selection of two columns from a data frame
+ params : a list of tuples containing the two parameters for the construction
+ of the distance weights and the transformation:
+ (k,distance_upper_bound,transformation)
+ if the list consists of a single element, the same parameters are
+ applied to all transformations
+ default is [(10,np.inf,"exponential")] for 10 knn neighbors, variable
+ bandwidth and exponential transformation
+ (see make_wnslx in UTILS)
+ distance_metric: metric for distance computations, either "Euclidean" (default) or "Arc"
+ (for decimal lat-lon degrees)
+ leafsize : parameter used to create the KDTree, default is 30
+ slx_vars : list with True,False for selection of X variables to which SLX should be applied
+ default is "All"
+ var_flag : flag for variance computation, default = 1 - uses analytical derivation,
+ = 0 - uses numerical approximation with inverse hessian
+ conv_flag : flag for convergence diagnostics, default = 0 for no diagnostics
+ = 1 - prints out the minimize convergence summary
+ verbose : boolean for intermediate results in nonlinear optimization, default is False
+ options : options specific to scipy minimize, such as {"disp":True}
+ (see scipy minimize docs)
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_coords : list of strings
+ Names of coordinate variables used in distance matrix
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ y : n by 1 numpy array with dependent variable
+ x : n by k array with explanatory variables, includes constant
+ xw : n by h array with selected columns of X that will have the
+ W(alpha) transformation applied to them
+ w : list of sparse CSR arrays to use in lag transformation, if
+ list has a single element, same weights applied for all
+ n : number of observations
+ k : number of explanatory variables in X (includes constant)
+ transform : tuple of transformations, either "power" or "exponential"
+ when same transformation applies to all, tuple is a single
+ element tuple (default is "exponential")
+ verbose : option for nonlinear optimization, either False (default) or
+ True
+ options : options specific to scipy minimize, such as {"disp":True}
+ (see scipy minimize docs)
+ betas : numpy array with parameter estimates
+ utu : sum of squared residuals
+ ihess : inverse of Hessian matrix
+ sign : estimate of residual variance (divided by n)
+ sig2 : same as sign
+ vm : coefficient variance-covariance matrix (sign x ihess)
+ predy : vector of predicted values
+ u : vector of residuals
+ ll : float
+ Log likelihood
+ aic : float
+ Akaike information criterion
+ schwarz : float
+ Schwarz information criterion
+ name_x : variable names for explanatory variables
+ name_ds : data set name
+ name_y : name of dependent variable
+ title : output header
+
+ Example
+ --------
+ >>> import numpy as np
+ >>> import geopandas as gpd
+ >>> from libpysal.examples import load_example, get_path
+ >>> import spreg
+
+ Open data on Chicago census tract SDOH variables from libpysal examples using geopandas.
+ If you don't have chicagoSDOH installed, you can do so by running `load_example('chicagoSDOH')`.
+
+ >>> dfs = gpd.read_file(get_path('Chi-SDOH.shp'))
+
+ For this example, we will use the 'HIS_ct' column (economic hardship index) as the
+ dependent variable and the 'Blk14P', 'Hisp14P', and 'EP_NOHSDP' columns as independent
+ variables. The coordinates "COORD_X" and "COORD_Y" will be used to construct the
+ spatial weights matrix.
+
+ >>> y = dfs[['HIS_ct']]
+ >>> x = dfs[['Blk14P','Hisp14P','EP_NOHSDP']]
+ >>> coords = dfs[["COORD_X","COORD_Y"]]
+
+ For the regression, we set var_flag = 1 to obtain the analytical standard errors.
+
+ >>> reg = spreg.NSLX(y, x, coords, var_flag=1)
+
+ We can easily obtain a full summary of all the results nicely formatted and
+ ready to be printed:
+
+ >>> print(reg.summary)
+ REGRESSION RESULTS
+ ------------------
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: NONLINEAR SLX
+ --------------------------------
+ Data set : unknown
+ Dependent Variable : HIS_ct Number of Observations: 791
+ Mean dependent var : 39.7301 Number of Variables : 7
+ S.D. dependent var : 13.8098 Degrees of Freedom : 784
+ Sigma-square : 32.287 Sum squared residual : 25538.9
+ S.E. of regression : 5.682 Log likelihood : -2496.609
+ Schwarz criterion : 5039.931 Akaike info criterion : 5007.218
+ Coordinates : COORD_X, COORD_Y Distance metric : Euclidean
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ CONSTANT 17.90865 0.43461 41.20635 0.00000
+ Blk14P 0.17910 0.00806 22.21475 0.00000
+ Hisp14P 0.05818 0.01525 3.81435 0.00014
+ EP_NOHSDP 0.65462 0.02750 23.80062 0.00000
+ We_Blk14P 17.16669 1.70331 10.07841 0.00000
+ We_Hisp14P 88.30447 81337.66263 0.00109 0.99913
+ We_EP_NOHSDP 10.02114 0.36436 27.50353 0.00000
+ ------------------------------------------------------------------------------------
+ Transformation: exponential
+ KNN: 10
+ Distance upper bound: inf
+ ================================ END OF REPORT =====================================
+
+ '''
+
+
+
+
+
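+
+# A hedged sketch (not part of the class source): given the fitted object reg from the
+# example above, the standard errors and z-statistics reported in the summary table can
+# be recovered from the documented attributes betas and vm:
+#
+#   se = np.sqrt(np.diag(reg.vm))        # standard errors
+#   zstat = reg.betas.flatten() / se     # z-statistics as printed in the output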
+
+def nslxobj(gamma0, y, x, xw, w, transform, verbose):
+'''
+ Objective function for minimize call, computes the sum of squared residuals in
+ the nonlinear SLX model. Note that all parameters other than gamma0 are passed
+ in the args argument of the minimize function
+
+ Parameters
+ ----------
+ gamma0 : current parameter estimates, consists of beta (for X) and alpha (for WX)
+ needs to be a flattened array
+ y : n by 1 vector with observations on the dependent variable
+ x : n by k matrix with observations on X, must include constant vector
+ xw : n by h matrix with columns of X that will be spatially lagged
+ w : list of sparse CSR weights for the X column transformations; if same weights
+ are used for all columns, must be a single element list
+ transform : list of transformations (either "power" or "exponential") to be applied to each
+ column of xw; if same transformation is applied to all, must be a single element
+ list
+ verbose : verbose option, whether or not the intermediate parameter values and residual sum
+ of squares are printed out
+
+ Returns
+ -------
+ res2 : sum of squared residuals
+
+ '''
+    n = xw.shape[0]
+    h = xw.shape[1]
+    if verbose:
+        print("gamma0", gamma0)
+    b0 = gamma0[0:-h]
+    alpha0 = gamma0[-h:]
+    # create WX
+    wx = nlmod(alpha0, xw, w, transform, gradflag=0)
+    wxs = np.sum(wx, axis=1).reshape(-1, 1)
+    xb = x @ b0.reshape(-1, 1)
+    res = y - xb - wxs
+    res2 = res.T @ res
+    if verbose:
+        print("res2", res2)
+
+    return res2
+
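+
+# Illustrative sketch only (mirroring the minimize call in BaseNSLX above): y, x, xw,
+# the weights list wlist and the starting values gamma0 are assumed to have been built
+# beforehand (see make_wnslx in UTILS for the weights).
+#
+#   from scipy.optimize import minimize
+#   fit = minimize(nslxobj, gamma0,
+#                  args=(y, x, xw, wlist, ("exponential",), False),
+#                  options={"maxiter": 500})
+#   betas, ssr = fit.x, fit.fun          # parameter estimates and sum of squared residuals
+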
+def nlmod(alpha, xw, w, transform, gradflag=0):
+'''
+ Constructs the matrix of spatially lagged X variables W(a)X (for gradflag = 0) and
+ the gradient matrix d(W(a)X)/d(a) (for gradflag = 1), for possibly different
+ alpha parameters, different weights and different transformations (the transformations
+ must match the weights and are not checked for compatibility). Calls _nltransform for
+ each relevant column of X. This allows for the possibility that not all columns of X are
+ used, as defined by slx_vars.
+
+ Parameters
+ ----------
+ alpha : array with alpha parameters, same number as relevant columns in X
+ must be flattened (not a vector)
+ xw : matrix with relevant columns of X to be lagged
+ w : a list containing the weights matrix (as sparse CSR) for each column
+ if the same weights matrix is used for all columns, w is a single element list,
+ otherwise, the number of weights must match the number of relevant columns in X and
+ the number of elements in the transform tuple
+ transform : a tuple with the transformations for each relevant column in X, either "power" or "exponential"
+ if the same transformation is used for all columns, transform is a single element tuple
+ (transform,), otherwise, the number of elements in the tuple must match the number of relevant
+ columns in X and the number of elements in the weights list
+ the transformation must match the type of weights, but this is not checked
+ gradflag : flag for computation of gradient matrix, = 0 by default (actual function)
+ = 1 - computes matrix of first partial derivatives with respect to alpha
+
+ Returns
+ -------
+ wx : matrix with spatially lagged X variables
+
+ '''
+    # alpha must be flattened
+    h = len(alpha)  # number of parameters
+    if xw.shape[1] != h:
+        raise Exception("Incompatible dimensions")
+    g = len(w)  # number of weights
+    if len(transform) != g:
+        raise Exception("Incompatible dimensions")
+
+    # initialize matrix of spatial lags
+    n = xw.shape[0]
+    wx = np.zeros((n, h))
+
+    for i in range(h):
+
+        if g == 1:
+            walpha = _nltransform(alpha[i], w[0], transform[0], gradflag=gradflag)  # element of single element tuple
+        elif g > 1:
+            walpha = _nltransform(alpha[i], w[i], transform[i], gradflag=gradflag)
+        else:
+            raise Exception("Operation not supported")
+        wx[:, i] = walpha @ xw[:, i]
+    return wx
+
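+
+# Dimension contract sketch (hypothetical arrays, not part of the module): with two
+# lagged columns and a single shared weights matrix, w and transform are single-element
+# containers and alpha has one entry per column of xw.
+#
+#   import scipy.sparse as SP
+#   wcsr = SP.csr_matrix(np.array([[0, .5, .5], [.5, 0, .5], [.5, .5, 0]]))
+#   xw2 = np.random.default_rng(0).uniform(size=(3, 2))
+#   wx2 = nlmod(np.array([1.0, 2.0]), xw2, [wcsr], ("exponential",), gradflag=0)
+#   wx2.shape                            # (3, 2): one W(alpha)X column per alpha
+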
+
+def _nltransform(a, w, transform, gradflag=0):
+'''
+ Constructs the transformed CSR sparse array for power and exponential transformation
+ for a given alpha parameter, input array and transformation. Note that the alpha parameters
+ are positive, but are used as negative powers in the exponential transformation.
+
+ Parameters
+ ----------
+ a : alpha coefficient as a (positive) scalar
+ w : CSR sparse array with weights
+ transform : transformation, either "power" or "exponential"
+ gradflag : flag for gradient computation, 0 = function, default
+ 1 = gradient
+
+ Returns
+ -------
+ walpha : CSR sparse array with transformed weights
+
+ '''
+    walpha = w.copy()
+
+    if transform.lower() == "exponential":
+
+        wdata = walpha
+        awdata = wdata * a
+
+        if gradflag == 0:
+
+            awdata = -awdata
+            np.exp(awdata.data, out=awdata.data)
+            walpha = awdata
+
+        elif gradflag == 1:
+
+            ww = -awdata
+            np.log(awdata.data, out=awdata.data)
+            wln = awdata
+            w1 = wln.multiply(wdata)
+            np.exp(ww.data, out=ww.data)
+            wgrad = ww.multiply(w1)
+            wgrad = -wgrad
+            walpha = wgrad
+
+    elif transform.lower() == "power":
+
+        if gradflag == 0:
+            walpha = walpha.power(a)
+        elif gradflag == 1:
+            walpha = walpha.power(a)
+            ww = w.copy()
+            np.log(ww.data, out=ww.data)
+            wgrad = walpha.multiply(ww)
+            walpha = wgrad
+
+
+    else:
+        raise Exception("Transformation not supported")
+
+    return walpha
+
+
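+
+# Commented check of the power branch in _nltransform above (hypothetical 2x2 sparse
+# array, not part of the module): gradflag=0 returns W**a element-wise on the non-zero
+# entries, and gradflag=1 returns its derivative with respect to a, W**a * ln(W).
+#
+#   import scipy.sparse as SP
+#   wsmall = SP.csr_matrix(np.array([[0.0, 0.5], [0.25, 0.0]]))
+#   f = _nltransform(2.0, wsmall, "power", gradflag=0)    # non-zero entries 0.5**2, 0.25**2
+#   g = _nltransform(2.0, wsmall, "power", gradflag=1)    # non-zero entries (w**2) * ln(w)
+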
+
+def _test():
+    import doctest
+
+    # the following line could be used to define an alternative to the '<BLANKLINE>' flag
+    # doctest.BLANKLINE_MARKER = 'something better than <BLANKLINE>'
+    start_suppress = np.get_printoptions()["suppress"]
+    np.set_printoptions(suppress=True)
+    doctest.testmod()
+    np.set_printoptions(suppress=start_suppress)
+
+
+if __name__ == "__main__":
+    _test()
+
+    from libpysal.examples import load_example, get_path
+    import geopandas as gpd
+    import spreg
+
+    dfs = gpd.read_file(load_example('columbus').get_path("columbus.shp"))
+    dfs['geometry'] = gpd.points_from_xy(dfs['X'], dfs['Y'])  # Transforming polygons to points
+    y = dfs[["CRIME"]]
+    x = dfs[["INC", "HOVAL", "DISCBD"]]
+    reg = spreg.NSLX(y, x, dfs['geometry'])
+    print(reg.output)
+    print(reg.summary)
+
+
+"""Ordinary Least Squares regression classes."""
+
+__author__ = "Luc Anselin lanselin@gmail.com, Pedro Amaral pedrovma@gmail.com, David C. Folch david.folch@asu.edu"
+import numpy as np
+import numpy.linalg as la
+from . import user_output as USER
+from .output import output, _spat_diag_out, _nonspat_mid, _nonspat_top, _summary_vif
+from . import robust as ROBUST
+from .utils import spdot, RegressionPropsY, RegressionPropsVM, set_warn, get_lags
+import pandas as pd
+from libpysal import weights  # needed for check on kernel weights in slx
+
+__all__ = ["OLS"]
+
+
+class BaseOLS(RegressionPropsY, RegressionPropsVM):
+
+"""
+ Ordinary least squares (OLS) (note: no consistency checks, diagnostics or
+ constant added)
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ xtx : float
+ X'X
+ xtxi : float
+ (X'X)^-1
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> ols = spreg.ols.BaseOLS(y,X)
+ >>> ols.betas
+ array([[46.42818268],
+ [ 0.62898397],
+ [-0.48488854]])
+ >>> ols.vm
+ array([[174.02245348, -6.52060364, -2.15109867],
+ [ -6.52060364, 0.28720001, 0.06809568],
+ [ -2.15109867, 0.06809568, 0.03336939]])
+ """
+
+    def __init__(self, y, x, robust=None, gwk=None, sig2n_k=True):
+        self.x = x
+        self.xtx = spdot(self.x.T, self.x)
+        xty = spdot(self.x.T, y)
+
+        self.xtxi = la.inv(self.xtx)
+        self.betas = np.dot(self.xtxi, xty)
+        predy = spdot(self.x, self.betas)
+
+        u = y - predy
+        self.u = u
+        self.predy = predy
+        self.y = y
+        self.n, self.k = self.x.shape
+
+        if sig2n_k:
+            self.sig2 = self.sig2n_k
+        else:
+            self.sig2 = self.sig2n
+
+        if robust is not None:
+            self.vm = ROBUST.robust_vm(reg=self, gwk=gwk, sig2n_k=sig2n_k)
+
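+
+# A hedged sketch of what the sig2n_k switch controls (reusing y and X from the
+# docstring example above; not part of the class source):
+#
+#   ols = spreg.ols.BaseOLS(y, X, sig2n_k=True)
+#   ols.sig2 == ols.utu / (ols.n - ols.k)    # denominator n - k (up to rounding)
+#   ols = spreg.ols.BaseOLS(y, X, sig2n_k=False)
+#   ols.sig2 == ols.utu / ols.n              # denominator n
+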
+
+
+[docs]
+class OLS(BaseOLS):
+"""
+ Ordinary least squares with results and diagnostics.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object (required if running spatial
+ diagnostics)
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+ nonspat_diag : boolean
+ If True, then compute non-spatial diagnostics on
+ the regression.
+ spat_diag : boolean
+ If True, then compute Lagrange multiplier tests (requires
+ w). Note: see moran for further tests.
+ moran : boolean
+ If True, compute Moran's I on the residuals. Note:
+ requires spat_diag=True.
+ white_test : boolean
+ If True, compute White's specification robust test.
+ (requires nonspat_diag=True)
+ vif : boolean
+ If True, compute variance inflation factor.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if the table with the coefficients' results and their inference is to be printed in LaTeX format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ robust : string
+ Adjustment for robust standard errors
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ r2 : float
+ R squared
+ ar2 : float
+ Adjusted R squared
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ sig2ML : float
+ Sigma squared (maximum likelihood)
+ f_stat : tuple
+ Statistic (float), p-value (float)
+ logll : float
+ Log likelihood
+ aic : float
+ Akaike information criterion
+ schwarz : float
+ Schwarz information criterion
+ std_err : array
+ 1xk array of standard errors of the betas
+ t_stat : list of tuples
+ t statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ mulColli : float
+ Multicollinearity condition number
+ jarque_bera : dictionary
+ 'jb': Jarque-Bera statistic (float); 'pvalue': p-value
+ (float); 'df': degrees of freedom (int)
+ breusch_pagan : dictionary
+ 'bp': Breusch-Pagan statistic (float); 'pvalue': p-value
+ (float); 'df': degrees of freedom (int)
+ koenker_bassett : dictionary
+ 'kb': Koenker-Bassett statistic (float); 'pvalue':
+ p-value (float); 'df': degrees of freedom (int)
+ white : dictionary
+ 'wh': White statistic (float); 'pvalue': p-value (float);
+ 'df': degrees of freedom (int)
+ lm_error : tuple
+ Lagrange multiplier test for spatial error model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float
+ lm_lag : tuple
+ Lagrange multiplier test for spatial lag model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float
+ rlm_error : tuple
+ Robust lagrange multiplier test for spatial error model;
+ tuple contains the pair (statistic, p-value), where each
+ is a float
+ rlm_lag : tuple
+ Robust lagrange multiplier test for spatial lag model;
+ tuple contains the pair (statistic, p-value), where each
+ is a float
+ lm_sarma : tuple
+ Lagrange multiplier test for spatial SARMA model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float
+ moran_res : tuple
+ Moran's I for the residuals; tuple containing the triple
+ (Moran's I, standardized Moran's I, p-value)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ xtx : float
+ :math:`X'X`
+ xtxi : float
+ :math:`(X'X)^{-1}`
+
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import OLS
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; also, the actual OLS class
+ requires data to be passed in as numpy arrays so the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the HOVAL column (home values) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ an nx1 numpy array.
+
+ >>> hoval = db.by_col("HOVAL")
+ >>> y = np.array(hoval)
+ >>> y.shape = (len(hoval), 1)
+
+ Extract CRIME (crime) and INC (income) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). spreg.OLS adds a vector of ones to the
+ independent variables passed in.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+
+ The minimum parameters needed to run an ordinary least squares regression
+ are the two numpy arrays containing the independent variable and dependent
+ variables respectively. To make the printed results more meaningful, the
+ user can pass in explicit names for the variables used; this is optional.
+
+ >>> ols = OLS(y, X, name_y='home value', name_x=['income','crime'], name_ds='columbus', white_test=True)
+
+ spreg.OLS computes the regression coefficients and their standard
+ errors, t-stats and p-values. It also computes a large battery of
+ diagnostics on the regression. In this example we also compute the White test,
+ which is not run by default (hence 'white_test=True' above). All of these results can be independently
+ accessed as attributes of the regression object created by running
+ spreg.OLS. They can also be accessed at one time by printing the
+ summary attribute of the regression object. In the example below, the
+ parameter on crime is -0.4849, with a t-statistic of -2.6544 and p-value
+ of 0.01087.
+
+ >>> ols.betas
+ array([[46.42818268],
+ [ 0.62898397],
+ [-0.48488854]])
+ >>> print(round(ols.t_stat[2][0],3))
+ -2.654
+ >>> print(round(ols.t_stat[2][1],3))
+ 0.011
+ >>> print(round(ols.r2,3))
+ 0.35
+
+ Or we can easily obtain a full summary of all the results nicely formatted and
+ ready to be printed:
+
+ >>> print(ols.summary)
+ REGRESSION RESULTS
+ ------------------
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
+ -----------------------------------------
+ Data set : columbus
+ Weights matrix : None
+ Dependent Variable : home value Number of Observations: 49
+ Mean dependent var : 38.4362 Number of Variables : 3
+ S.D. dependent var : 18.4661 Degrees of Freedom : 46
+ R-squared : 0.3495
+ Adjusted R-squared : 0.3212
+ Sum squared residual: 10647 F-statistic : 12.3582
+ Sigma-square : 231.457 Prob(F-statistic) : 5.064e-05
+ S.E. of regression : 15.214 Log likelihood : -201.368
+ Sigma-square ML : 217.286 Akaike info criterion : 408.735
+ S.E of regression ML: 14.7406 Schwarz criterion : 414.411
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error t-Statistic Probability
+ ------------------------------------------------------------------------------------
+ CONSTANT 46.4281827 13.1917570 3.5194844 0.0009867
+ income 0.6289840 0.5359104 1.1736736 0.2465669
+ crime -0.4848885 0.1826729 -2.6544086 0.0108745
+ ------------------------------------------------------------------------------------
+ <BLANKLINE>
+ REGRESSION DIAGNOSTICS
+ MULTICOLLINEARITY CONDITION NUMBER 12.538
+ <BLANKLINE>
+ TEST ON NORMALITY OF ERRORS
+ TEST DF VALUE PROB
+ Jarque-Bera 2 39.706 0.0000
+ <BLANKLINE>
+ DIAGNOSTICS FOR HETEROSKEDASTICITY
+ RANDOM COEFFICIENTS
+ TEST DF VALUE PROB
+ Breusch-Pagan test 2 5.767 0.0559
+ Koenker-Bassett test 2 2.270 0.3214
+ <BLANKLINE>
+ SPECIFICATION ROBUST TEST
+ TEST DF VALUE PROB
+ White 5 2.906 0.7145
+ ================================ END OF REPORT =====================================
+
+ If the optional parameters w and spat_diag are passed to spreg.OLS,
+ spatial diagnostics will also be computed for the regression. These
+ include Lagrange multiplier tests and Moran's I of the residuals. The w
+ parameter is a PySAL spatial weights matrix. In this example, w is built
+ directly from the shapefile columbus.shp, but w can also be read in from a
+ GAL or GWT file. In this case a rook contiguity weights matrix is built,
+ but PySAL also offers queen contiguity, distance weights and k nearest
+ neighbor weights among others. In the example, the Moran's I of the
+ residuals is 0.204 with a standardized value of 2.592 and a p-value of
+ 0.0095.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> ols = OLS(y, X, w, spat_diag=True, moran=True, name_y='home value', name_x=['income','crime'], name_ds='columbus')
+ >>> ols.betas
+ array([[46.42818268],
+ [ 0.62898397],
+ [-0.48488854]])
+
+ >>> print(round(ols.moran_res[0],3))
+ 0.204
+ >>> print(round(ols.moran_res[1],3))
+ 2.592
+ >>> print(round(ols.moran_res[2],4))
+ 0.0095
+
+ """
+
+
+"""
+Ordinary Least Squares regression with regimes.
+"""
+
+__author__ = "Luc Anselin, Pedro V. Amaral, Daniel Arribas-Bel"
+
+import numpy as np
+import multiprocessing as mp
+import pandas as pd
+from . import regimes as REGI
+from . import user_output as USER
+from .utils import set_warn, RegressionProps_basic, spdot, RegressionPropsY, get_lags, optim_k
+from .ols import BaseOLS
+from .robust import hac_multi
+from .output import output, _spat_diag_out, _nonspat_mid, _nonspat_top
+from .skater_reg import Skater_reg
+
+
+[docs]
+class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY):
+"""
+ Ordinary least squares with regimes, results and diagnostics.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ w : pysal W object
+ Spatial weights object (required if running spatial
+ diagnostics)
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX type.
+ Note: WX is computed using the complete weights matrix
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+ nonspat_diag : boolean
+ If True, then compute non-spatial diagnostics on
+ the regression.
+ spat_diag : boolean
+ If True, then compute Lagrange multiplier tests (requires
+ w). Note: see moran for further tests.
+ moran : boolean
+ If True, compute Moran's I on the residuals. Note:
+ requires spat_diag=True.
+ white_test : boolean
+ If True, compute White's specification robust test.
+ (requires nonspat_diag=True)
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ constant_regi: string, optional
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default)
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep : boolean
+ If True, a separate regression is run for each regime.
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ robust : string
+ Adjustment for robust standard errors
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ r2 : float
+ R squared
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ ar2 : float
+ Adjusted R squared
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2ML : float
+ Sigma squared (maximum likelihood)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ f_stat : tuple
+ Statistic (float), p-value (float)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ logll : float
+ Log likelihood
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ aic : float
+ Akaike information criterion
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ schwarz : float
+ Schwarz information criterion
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ t_stat : list of tuples
+ t statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mulColli : float
+ Multicollinearity condition number
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ jarque_bera : dictionary
+ 'jb': Jarque-Bera statistic (float); 'pvalue': p-value
+ (float); 'df': degrees of freedom (int)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ breusch_pagan : dictionary
+ 'bp': Breusch-Pagan statistic (float); 'pvalue': p-value
+ (float); 'df': degrees of freedom (int)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ koenker_bassett: dictionary
+ 'kb': Koenker-Bassett statistic (float); 'pvalue': p-value (float);
+ 'df': degrees of freedom (int). Only available in dictionary
+ 'multi' when multiple regressions (see 'multi' below for details).
+ white : dictionary
+ 'wh': White statistic (float); 'pvalue': p-value (float);
+ 'df': degrees of freedom (int).
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ lm_error : tuple
+ Lagrange multiplier test for spatial error model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ lm_lag : tuple
+ Lagrange multiplier test for spatial lag model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ rlm_error : tuple
+ Robust lagrange multiplier test for spatial error model;
+ tuple contains the pair (statistic, p-value), where each
+ is a float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ rlm_lag : tuple
+ Robust lagrange multiplier test for spatial lag model;
+ tuple contains the pair (statistic, p-value), where each
+ is a float. Only available in dictionary 'multi' when
+ multiple regressions (see 'multi' below for details)
+ lm_sarma : tuple
+ Lagrange multiplier test for spatial SARMA model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ moran_res : tuple
+ Moran's I for the residuals; tuple containing the triple
+ (Moran's I, standardized Moran's I, p-value)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ xtx : float
+ :math:`X'X`. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ xtxi : float
+ :math:`(X'X)^{-1}`. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each observation to
+ a regime. Assumed to be aligned with 'x'.
+ constant_regi : string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime.
+ cols2regi : list
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate.
+ nr : int
+ Number of different regimes in the 'regimes' list.
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import OLS_Regimes
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it
+ the dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = db.by_col(y_var)
+ >>> y = np.array(y)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ We can now run the regression and then have a summary of the output
+ by typing: olsr.summary
+
+ >>> olsr = OLS_Regimes(y, x, regimes, nonspat_diag=False, name_y=y_var, name_x=['PS90','UE90'], name_regimes=r_var, name_ds='NAT')
+ >>> print(olsr.summary)
+ REGRESSION RESULTS
+ ------------------
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES ESTIMATION - REGIME 0
+ ---------------------------------------------------------------
+ Data set : NAT
+ Weights matrix : None
+ Dependent Variable : 0_HR90 Number of Observations: 1673
+ Mean dependent var : 3.3416 Number of Variables : 3
+ S.D. dependent var : 4.6795 Degrees of Freedom : 1670
+ R-squared : 0.1271
+ Adjusted R-squared : 0.1260
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error t-Statistic Probability
+ ------------------------------------------------------------------------------------
+ 0_CONSTANT 0.39643 0.24816 1.59745 0.11035
+ 0_PS90 0.65583 0.09663 6.78728 0.00000
+ 0_UE90 0.48704 0.03629 13.42213 0.00000
+ ------------------------------------------------------------------------------------
+ Regimes variable: SOUTH
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES ESTIMATION - REGIME 1
+ ---------------------------------------------------------------
+ Data set : NAT
+ Weights matrix : None
+ Dependent Variable : 1_HR90 Number of Observations: 1412
+ Mean dependent var : 9.5493 Number of Variables : 3
+ S.D. dependent var : 7.0389 Degrees of Freedom : 1409
+ R-squared : 0.0661
+ Adjusted R-squared : 0.0647
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error t-Statistic Probability
+ ------------------------------------------------------------------------------------
+ 1_CONSTANT 5.59835 0.46895 11.93816 0.00000
+ 1_PS90 1.16210 0.21667 5.36338 0.00000
+ 1_UE90 0.53164 0.05946 8.94164 0.00000
+ ------------------------------------------------------------------------------------
+ Regimes variable: SOUTH
+ ------------------------------------------------------------------------------------
+ <BLANKLINE>
+ GLOBAL DIAGNOSTICS
+ <BLANKLINE>
+ REGIMES DIAGNOSTICS - CHOW TEST
+ VARIABLE DF VALUE PROB
+ CONSTANT 1 96.129 0.0000
+ PS90 1 4.554 0.0328
+ UE90 1 0.410 0.5220
+ Global test 3 680.960 0.0000
+ ================================ END OF REPORT =====================================
+ """
+
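+
+# A hedged sketch of the cols2regi option documented above (same NAT data as in the
+# docstring example; coefficient values are not reproduced here): letting PS90 vary by
+# regime while holding UE90 fixed across regimes could be requested as
+#
+#   olsr = OLS_Regimes(y, x, regimes, cols2regi=[True, False], nonspat_diag=False,
+#                      name_y=y_var, name_x=['PS90','UE90'],
+#                      name_regimes=r_var, name_ds='NAT')
+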
+
+[docs]
+ def __init__(
+ self,
+ y,
+ x,
+ regimes,
+ w=None,
+ robust=None,
+ gwk=None,
+ slx_lags=0,
+ sig2n_k=True,
+ nonspat_diag=True,
+ spat_diag=False,
+ moran=False,
+ white_test=False,
+ vm=False,
+ constant_regi="many",
+ cols2regi="all",
+ regime_err_sep=True,
+ cores=False,
+ name_y=None,
+ name_x=None,
+ name_regimes=None,
+ name_w=None,
+ name_gwk=None,
+ name_ds=None,
+ latex=False
+ ):
+
+ n=USER.check_arrays(y,x)
+ y,name_y=USER.check_y(y,n,name_y)
+ USER.check_robust(robust,gwk)
+ if robust == "hac":
+ if regime_err_sep:
+ set_warn(
+ self,
+ "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.",
+ )
+ regime_err_sep=False
+ spat_diag,warn=USER.check_spat_diag(spat_diag=spat_diag,w=w,robust=robust,slx_lags=slx_lags)
+ set_warn(self,warn)
+ if robust in ["hac", "white"] and white_test:
+ set_warn(
+ self,
+ "White test not available when standard errors are estimated by HAC or White correction.",
+ )
+ white_test=False
+
+ x_constant,name_x,warn=USER.check_constant(x,name_x,just_rem=True)
+ name_x=USER.set_name_x(name_x,x_constant,constant=True)
+ if spat_diag or moran:
+ w=USER.check_weights(w,y,slx_lags=slx_lags,w_required=True,allow_wk=True)
+ else:
+ w=USER.check_weights(w,y,slx_lags=slx_lags,allow_wk=True)
+ if slx_lags > 0:
+ lag_x=get_lags(w,x_constant,slx_lags)
+ x_constant=np.hstack((x_constant,lag_x))
+ name_x+=USER.set_name_spatial_lags(name_x,slx_lags)
+
+ set_warn(self,warn)
+ self.slx_lags=slx_lags
+ self.name_x_r=USER.set_name_x(name_x,x_constant)
+ self.constant_regi=constant_regi
+ self.cols2regi=cols2regi
+ self.name_w=USER.set_name_w(name_w,w)
+ self.name_gwk=USER.set_name_w(name_gwk,gwk)
+ self.name_ds=USER.set_name_ds(name_ds)
+ self.name_y=USER.set_name_y(name_y)
+
+ regimes,name_regimes=USER.check_reg_list(regimes,name_regimes,n)
+ self.name_regimes=USER.set_name_ds(name_regimes)
+ self.n=n
+ cols2regi=REGI.check_cols2regi(
+ constant_regi,cols2regi,x_constant,add_cons=False
+ )
+ self.regimes_set=REGI._get_regimes_set(regimes)
+ self.regimes=regimes
+ USER.check_regimes(self.regimes_set,self.n,x_constant.shape[1])
+ self.regime_err_sep=regime_err_sep
+ if (
+ regime_err_sep == True
+ and set(cols2regi) == set([True])
+ and constant_regi == "many"
+ ):
+ self.y=y
+ regi_ids=dict(
+ (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set
+ )
+ self._ols_regimes_multi(
+ x_constant,
+ w,
+ regi_ids,
+ cores,
+ gwk,
+ slx_lags,
+ sig2n_k,
+ robust,
+ nonspat_diag,
+ spat_diag,
+ vm,
+ name_x,
+ moran,
+ white_test,
+ latex
+ )
+ else:
+ x,self.name_x,x_rlist=REGI.Regimes_Frame.__init__(
+ self,x_constant,regimes,constant_regi,cols2regi,name_x,rlist=True
+ )
+
+ self.output=pd.DataFrame(self.name_x,
+ columns=['var_names'])
+ self.output['var_type']=['x']*len(self.name_x)
+ self.output['regime']=x_rlist
+ self.output['equation']=0
+
+ BaseOLS.__init__(self,y=y,x=x,robust=robust,gwk=gwk,sig2n_k=sig2n_k)
+ if regime_err_sep == True and robust == None:
+ y2,x2=REGI._get_weighted_var(
+ regimes,self.regimes_set,sig2n_k,self.u,y,x
+ )
+ ols2=BaseOLS(y=y2,x=x2,sig2n_k=sig2n_k)
+ RegressionProps_basic(self,betas=ols2.betas,vm=ols2.vm)
+ self.title=(
+ "ORDINARY LEAST SQUARES - REGIMES (Group-wise heteroskedasticity)"
+ )
+ if slx_lags > 0:
+ self.title="ORDINARY LEAST SQUARES WITH SLX - REGIMES (Group-wise heteroskedasticity)"
+ nonspat_diag=None
+ set_warn(
+ self,
+ "Residuals treated as homoskedastic for the purpose of diagnostics.",
+ )
+ else:
+ if slx_lags == 0:
+ self.title="ORDINARY LEAST SQUARES - REGIMES"
+ else:
+ self.title="ORDINARY LEAST SQUARES WITH SLX - REGIMES"
+ self.robust=USER.set_robust(robust)
+ self.chow=REGI.Chow(self)
+ self.other_top,self.other_mid,other_end=("","","")# strings where function-specific diag. are stored
+ if nonspat_diag:
+ self.other_mid+=_nonspat_mid(self,white_test=white_test)
+ self.other_top+=_nonspat_top(self)
+ if spat_diag:
+ other_end+=_spat_diag_out(self,w,'ols',moran=moran)#Must decide what to do with W.
+ output(reg=self,vm=vm,robust=robust,other_end=other_end,latex=latex)
+[docs]
+class OLS_Endog_Regimes(OLS_Regimes):
+"""
+ Ordinary least squares with endogenous regimes. Based on the function skater_reg as shown in :cite:`Anselin2021`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object (required if running spatial
+ diagnostics)
+ n_clusters : int
+ Number of clusters to be used in the endogenous regimes.
+ If None (default), the number of clusters will be chosen
+ according to the function utils.optim_k using a method
+ adapted from Mojena (1977)'s Rule Two
+ quorum : int
+ Minimum number of observations in a cluster to be considered
+ Must be larger than the number of variables in x
+ Default value is 30 or 10*(k+1), whichever is larger, with k the number of columns in x.
+ trace : boolean
+ Sets whether to store intermediate results of the clustering
+ Hard-coded to True if n_clusters is None
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ **kwargs : additional keyword arguments depending on the specific model
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ robust : string
+ Adjustment for robust standard errors
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ r2 : float
+ R squared
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ ar2 : float
+ Adjusted R squared
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2ML : float
+ Sigma squared (maximum likelihood)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ f_stat : tuple
+ Statistic (float), p-value (float)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ logll : float
+ Log likelihood
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ aic : float
+ Akaike information criterion
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ schwarz : float
+ Schwarz information criterion
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ t_stat : list of tuples
+ t statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mulColli : float
+ Multicollinearity condition number
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ jarque_bera : dictionary
+ 'jb': Jarque-Bera statistic (float); 'pvalue': p-value
+ (float); 'df': degrees of freedom (int)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ breusch_pagan : dictionary
+ 'bp': Breusch-Pagan statistic (float); 'pvalue': p-value
+ (float); 'df': degrees of freedom (int)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ koenker_bassett: dictionary
+ 'kb': Koenker-Bassett statistic (float); 'pvalue': p-value (float);
+ 'df': degrees of freedom (int). Only available in dictionary
+ 'multi' when multiple regressions (see 'multi' below for details).
+ white : dictionary
+ 'wh': White statistic (float); 'pvalue': p-value (float);
+ 'df': degrees of freedom (int).
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ lm_error : tuple
+ Lagrange multiplier test for spatial error model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ lm_lag : tuple
+ Lagrange multiplier test for spatial lag model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ rlm_error : tuple
+ Robust lagrange multiplier test for spatial error model;
+ tuple contains the pair (statistic, p-value), where each
+ is a float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ rlm_lag : tuple
+ Robust lagrange multiplier test for spatial lag model;
+ tuple contains the pair (statistic, p-value), where each
+ is a float. Only available in dictionary 'multi' when
+ multiple regressions (see 'multi' below for details)
+ lm_sarma : tuple
+ Lagrange multiplier test for spatial SARMA model; tuple
+ contains the pair (statistic, p-value), where each is a
+ float. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ moran_res : tuple
+ Moran's I for the residuals; tuple containing the triple
+ (Moran's I, standardized Moran's I, p-value)
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ xtx : float
+ :math:`X'X`. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ xtxi : float
+ :math:`(X'X)^{-1}`. Only available in dictionary 'multi' when multiple
+ regressions (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each observation to
+ a regime. Assumed to be aligned with 'x'.
+ constant_regi : string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime.
+ cols2regi : list
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate.
+ nr : int
+ Number of different regimes in the 'regimes' list.
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression.
+ SSR : list
+ list with the total sum of squared residuals for the model
+ considering all regimes for each of steps of number of regimes
+ considered, starting with the solution with 2 regimes.
+ clusters : int
+ Number of clusters considered in the endogenous regimes
+ _trace : list
+ List of dictionaries with the clustering results for each
+ number of clusters tested. Only available if n_clusters is
+ None or trace=True.
+
+ Examples
+ --------
+ >>> import libpysal
+ >>> import numpy as np
+ >>> np.set_printoptions(legacy='1.25') #to avoid printing issues with numpy floats
+ >>> import geopandas as gpd
+ >>> from spreg import OLS_Endog_Regimes
+
+ Open data on Baltimore house sales price and characteristics in Baltimore
+ from libpysal examples using geopandas.
+
+ >>> db = gpd.read_file(libpysal.examples.get_path('baltim.shp'))
+
+ We will create a weights matrix based on contiguity.
+
+ >>> w = libpysal.weights.Queen.from_dataframe(db, use_index=True)
+ >>> w.transform = "r"
+
+ For this example, we will use the 'PRICE' column as the dependent variable and
+ the 'NROOM', 'AGE', and 'SQFT' columns as independent variables.
+ At this point, we will let the model choose the number of clusters.
+
+ >>> olsr = OLS_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, name_w="baltim_q.gal")
+
+ The function `print(olsr.summary)` can be used to visualize the results of the regression.
+
+ Alternatively, we can check individual attributes:
+ >>> olsr.betas
+ array([[26.24209866],
+ [ 2.40329959],
+ [-0.24183707],
+ [ 0.45714794],
+ [19.84817747],
+ [ 5.12117483],
+ [-0.65466516],
+ [ 1.10034154]])
+ >>> olsr.SSR
+ [68840.74965798721, 62741.55717492997]
+ >>> olsr.clusters
+ array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
+ 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int32)
+
+ We will now set the number of clusters to 2 and run the regression again.
+
+ >>> olsr = OLS_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, n_clusters=2, name_w="baltim_q.gal")
+
+ The function `print(olsr.summary)` can be used to visualize the results of the regression.
+
+ Alternatively, we can check individual attributes as before:
+ >>> olsr.betas
+ array([[26.24209866],
+ [ 2.40329959],
+ [-0.24183707],
+ [ 0.45714794],
+ [19.84817747],
+ [ 5.12117483],
+ [-0.65466516],
+ [ 1.10034154]])
+ >>> olsr.SSR
+ [68840.74965798721]
+ >>> olsr.clusters
+ array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
+ 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int32)
+
+ """
+
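+
+# Hedged note on the quorum default used in __init__ below: with k columns in x
+# (excluding the constant), the minimum cluster size defaults to max(10 * (k + 1), 30);
+# it can also be set explicitly, e.g. (illustrative call, results not shown):
+#
+#   olsr = OLS_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w,
+#                            quorum=50, name_w="baltim_q.gal")
+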
+
+[docs]
+    def __init__(
+        self, y, x, w, n_clusters=None, quorum=-1, trace=True, name_y=None, name_x=None, **kwargs):
+
+        n = USER.check_arrays(y, x)
+        y, name_y = USER.check_y(y, n, name_y)
+        w = USER.check_weights(w, y, w_required=True)
+        x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True)
+        set_warn(self, warn)
+        # Standardize the variables
+        x_std = (x_constant - np.mean(x_constant, axis=0)) / np.std(x_constant, axis=0)
+
+        if quorum < 0:
+            quorum = np.max([(x_constant.shape[1] + 1) * 10, 30])
+
+        if not n_clusters:
+            n_clusters_opt = x_constant.shape[0] * 0.70 // quorum
+            if n_clusters_opt < 2:
+                raise ValueError(
+                    "The combination of the values of `N` and `quorum` is not compatible with regimes estimation.")
+            sk_reg_results = Skater_reg().fit(n_clusters_opt, w, x_std, {'reg': BaseOLS, 'y': y, 'x': x_constant}, quorum=quorum, trace=True)
+            n_clusters = optim_k([sk_reg_results._trace[i][1][2] for i in range(1, len(sk_reg_results._trace))])
+            self.clusters = sk_reg_results._trace[n_clusters - 1][0]
+        else:
+            try:
+                # Call the Skater_reg method based on OLS
+                sk_reg_results = Skater_reg().fit(n_clusters, w, x_std, {'reg': BaseOLS, 'y': y, 'x': x_constant}, quorum=quorum, trace=trace)
+                self.clusters = sk_reg_results.current_labels_
+            except Exception as e:
+                if str(e) == "one or more input arrays have more columns than rows":
+                    raise ValueError("One or more input ended up with more variables than observations. Please check your setting for `quorum`.")
+                else:
+                    print("An error occurred:", e)
+
+        self._trace = sk_reg_results._trace
+        self.SSR = [self._trace[i][1][2] for i in range(1, len(self._trace))]
+
+        OLS_Regimes.__init__(self, y, x_constant, regimes=self.clusters, w=w, name_regimes='Skater_reg', name_y=name_y, name_x=name_x, **kwargs)
+"""
+Spatial Fixed Effects Panel model based on: :cite:`Elhorst2003`
+"""
+
+__author__ = "Wei Kang weikang9009@gmail.com, \
+              Pedro Amaral pedroamaral@cedeplar.ufmg.br, \
+              Pablo Estrada pabloestradace@gmail.com"
+
+import numpy as np
+import numpy.linalg as la
+from scipy import sparse as sp
+from scipy.sparse.linalg import splu as SuperLU
+from .utils import RegressionPropsY, RegressionPropsVM, inverse_prod, set_warn
+from .sputils import spdot, spfill_diagonal, spinv
+from . import diagnostics as DIAG
+from . import user_output as USER
+from . import summary_output as SUMMARY
+
+try:
+    from scipy.optimize import minimize_scalar
+
+    minimize_scalar_available = True
+except ImportError:
+    minimize_scalar_available = False
+
+from .panel_utils import check_panel, demean_panel
+
+__all__ = ["Panel_FE_Lag", "Panel_FE_Error"]
+
+
+class BasePanel_FE_Lag(RegressionPropsY, RegressionPropsVM):
+"""
+ Base ML method for a fixed effects spatial lag model (note no consistency
+ checks, diagnostics or constants added) :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : array
+ (n*t)x1 array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and one column for each
+ independent (exogenous) variable
+ (note: must already include constant term)
+ w : pysal W object
+ Spatial weights matrix
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+
+ Attributes
+ ----------
+ betas : array
+ (k+1)x1 array of estimated coefficients (rho last)
+ rho : float
+ estimate of spatial autoregressive coefficient
+ u : array
+ (nxt)x1 array of residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the rho)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, no constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1)
+ vm1 : array
+ Variance covariance matrix (k+2 x k+2) includes sigma2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ predy_e : array
+ predicted values from reduced form
+ e_pred : array
+ prediction errors using reduced form predicted values
+ """
+
+    def __init__(self, y, x, w, epsilon=0.0000001):
+        # set up main regression variables and spatial filters
+        self.n = w.n
+        self.t = y.shape[0] // self.n
+        self.k = x.shape[1]
+        self.epsilon = epsilon
+        # Demeaned variables
+        self.y = demean_panel(y, self.n, self.t)
+        self.x = demean_panel(x, self.n, self.t)
+        # Big W matrix
+        W = w.full()[0]
+        Wsp = w.sparse
+        Wsp_nt = sp.kron(sp.identity(self.t), Wsp, format="csr")
+        # lag dependent variable
+        ylag = spdot(Wsp_nt, self.y)
+        # b0, b1, e0 and e1
+        xtx = spdot(self.x.T, self.x)
+        xtxi = la.inv(xtx)
+        xty = spdot(self.x.T, self.y)
+        xtyl = spdot(self.x.T, ylag)
+        b0 = spdot(xtxi, xty)
+        b1 = spdot(xtxi, xtyl)
+        e0 = self.y - spdot(self.x, b0)
+        e1 = ylag - spdot(self.x, b1)
+
+        # concentrated Log Likelihood
+        I = sp.identity(self.n)
+        res = minimize_scalar(
+            lag_c_loglik_sp,
+            0.0,
+            bounds=(-1.0, 1.0),
+            args=(self.n, self.t, e0, e1, I, Wsp),
+            method="bounded",
+            options={"xatol": epsilon},
+        )
+        self.rho = res.x[0][0]
+
+        # compute full log-likelihood, including constants
+        ln2pi = np.log(2.0 * np.pi)
+        llik = -res.fun - (self.n * self.t) / 2.0 * ln2pi - (self.n * self.t) / 2.0
+        self.logll = llik[0][0]
+
+        # b, residuals and predicted values
+        b = b0 - self.rho * b1
+        self.betas = np.vstack((b, self.rho))  # rho added as last coefficient
+        self.u = e0 - self.rho * e1
+        self.predy = self.y - self.u
+
+        xb = spdot(self.x, b)
+
+        self.predy_e = inverse_prod(
+            Wsp_nt, xb, self.rho, inv_method="power_exp", threshold=epsilon
+        )
+        self.e_pred = self.y - self.predy_e
+
+        # residual variance
+        self._cache = {}
+        self.sig2 = spdot(self.u.T, self.u) / (self.n * self.t)
+
+        # information matrix
+        a = -self.rho * W
+        spfill_diagonal(a, 1.0)
+        ai = spinv(a)
+        wai = spdot(Wsp, ai)
+        tr1 = wai.diagonal().sum()  # same for sparse and dense
+
+        wai2 = spdot(wai, wai)
+        tr2 = wai2.diagonal().sum()
+
+        waiTwai = spdot(wai.T, wai)
+        tr3 = waiTwai.diagonal().sum()
+
+        wai_nt = sp.kron(sp.identity(self.t), wai, format="csr")
+        wpredy = spdot(wai_nt, xb)
+        xTwpy = spdot(x.T, wpredy)
+
+        waiTwai_nt = sp.kron(sp.identity(self.t), waiTwai, format="csr")
+        wTwpredy = spdot(waiTwai_nt, xb)
+        wpyTwpy = spdot(xb.T, wTwpredy)
+
+        # order of variables is beta, rho, sigma2
+        v1 = np.vstack((xtx / self.sig2, xTwpy.T / self.sig2, np.zeros((1, self.k))))
+        v2 = np.vstack(
+            (
+                xTwpy / self.sig2,
+                self.t * (tr2 + tr3) + wpyTwpy / self.sig2,
+                self.t * tr1 / self.sig2,
+            )
+        )
+        v3 = np.vstack(
+            (
+                np.zeros((self.k, 1)),
+                self.t * tr1 / self.sig2,
+                self.n * self.t / (2.0 * self.sig2 ** 2),
+            )
+        )
+
+        v = np.hstack((v1, v2, v3))
+
+        self.vm1 = la.inv(v)  # vm1 includes variance for sigma2
+        self.vm = self.vm1[:-1, :-1]  # vm is for coefficients only
+        self.varb = la.inv(np.hstack((v1[:-1], v2[:-1])))
+        self.n = self.n * self.t  # change the n, for degree of freedom
+
+
+
+[docs]
+class Panel_FE_Lag(BasePanel_FE_Lag):
+"""
+ ML estimation of the fixed effects spatial lag model with all results and
+ diagnostics :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas object
+ nxt or (nxt)x1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ nx(txk) or (nxt)xk array for independent (exogenous)
+ variables, no constant
+ w : pysal W object
+ Spatial weights object
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+
+ Attributes
+ ----------
+ betas : array
+ (k+1)x1 array of estimated coefficients (rho last)
+ rho : float
+ estimate of spatial autoregressive coefficient
+ u : array
+ (nxt)x1 array of residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the rho)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, no constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1), all coefficients
+ vm1 : array
+ Variance covariance matrix (k+2 x k+2), includes sig2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ aic : float
+ Akaike information criterion
+ schwarz : float
+ Schwarz criterion
+ predy_e : array
+ predicted values from reduced form
+ e_pred : array
+ prediction errors using reduced form predicted values
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ utu : float
+ Sum of squared residuals
+ std_err : array
+ 1x(k+1) array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> nat = libpysal.examples.load_example("NCOVR")
+ >>> db = libpysal.io.open(nat.get_path("NAT.dbf"), "r")
+ >>> nat_shp = libpysal.examples.get_path("NAT.shp")
+ >>> w = libpysal.weights.Queen.from_shapefile(nat_shp)
+ >>> w.transform = 'r'
+ >>> name_y = ["HR70", "HR80", "HR90"]
+ >>> y = np.array([db.by_col(name) for name in name_y]).T
+ >>> name_x = ["RD70", "RD80", "RD90", "PS70", "PS80", "PS90"]
+ >>> x = np.array([db.by_col(name) for name in name_x]).T
+ >>> fe_lag = spreg.Panel_FE_Lag(y, x, w, name_y=name_y, name_x=name_x, name_ds="NAT")
+ Warning: Assuming panel is in wide format, i.e. y[:, 0] refers to T0, y[:, 1] refers to T1, etc.
+ Similarly, assuming x[:, 0:T] refers to T periods of k1, x[:, T+1:2T] refers to k2, etc.
+ >>> np.around(fe_lag.betas, decimals=4)
+ array([[ 0.8006],
+ [-2.6004],
+ [ 0.1903]])
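+
+ As an illustrative follow-up (not part of the doctest), the spatial
+ autoregressive coefficient reported last in `betas` is also available
+ directly as an attribute::
+
+     print(fe_lag.rho)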
+ """
+
+
+[docs]
+    def __init__(
+        self,
+        y,
+        x,
+        w,
+        epsilon=0.0000001,
+        vm=False,
+        name_y=None,
+        name_x=None,
+        name_w=None,
+        name_ds=None,
+    ):
+        n_rows = USER.check_arrays(y, x)
+        x_constant, name_x, warn = USER.check_constant(x, name_x, True)
+        set_warn(self, warn)
+        bigy, bigx, name_y, name_x, warn = check_panel(y, x_constant, w, name_y, name_x)
+        set_warn(self, warn)
+        w = USER.check_weights(w, bigy, w_required=True, time=True)
+
+        BasePanel_FE_Lag.__init__(self, bigy, bigx, w, epsilon=epsilon)
+        # increase by 1 to have correct aic and sc, include rho in count
+        self.k += 1
+        self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG PANEL" + " - FIXED EFFECTS"
+        self.name_ds = USER.set_name_ds(name_ds)
+        self.name_y = USER.set_name_y(name_y)
+        self.name_x = USER.set_name_x(name_x, bigx, constant=True)
+        name_ylag = USER.set_name_yend_sp(self.name_y)
+        self.name_x.append(name_ylag)  # rho changed to last position
+        self.name_w = USER.set_name_w(name_w, w)
+        self.aic = DIAG.akaike(reg=self)
+        self.schwarz = DIAG.schwarz(reg=self)
+        SUMMARY.Panel_FE_Lag(reg=self, w=w, vm=vm)
+
+
+
+
+class BasePanel_FE_Error(RegressionPropsY, RegressionPropsVM):
+"""
+ Base ML method for a fixed effects spatial error model (note no consistency
+ checks, diagnostics or constants added) :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : array
+ (n*t)x1 array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and one column for each
+ independent (exogenous) variable
+ (note: must already include constant term)
+ w : pysal W object
+ Spatial weights matrix
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ lam : float
+ estimate of spatial autoregressive coefficient
+ u : array
+ (nxt)x1 array of residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the lambda)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, no constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1)
+ vm1 : array
+ Variance covariance matrix (k+2 x k+2) includes sigma2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ """
+
+    def __init__(self, y, x, w, epsilon=0.0000001):
+        # set up main regression variables and spatial filters
+        self.n = w.n
+        self.t = y.shape[0] // self.n
+        self.k = x.shape[1]
+        self.epsilon = epsilon
+        # Demeaned variables
+        self.y = demean_panel(y, self.n, self.t)
+        self.x = demean_panel(x, self.n, self.t)
+        # Big W matrix
+        W = w.full()[0]
+        Wsp = w.sparse
+        Wsp_nt = sp.kron(sp.identity(self.t), Wsp, format="csr")
+        # lag dependent variable
+        ylag = spdot(Wsp_nt, self.y)
+        xlag = spdot(Wsp_nt, self.x)
+
+        # concentrated Log Likelihood
+        I = sp.identity(self.n)
+        res = minimize_scalar(
+            err_c_loglik_sp,
+            0.0,
+            bounds=(-1.0, 1.0),
+            args=(self.n, self.t, self.y, ylag, self.x, xlag, I, Wsp),
+            method="bounded",
+            options={"xatol": epsilon},
+        )
+        self.lam = res.x
+
+        # compute full log-likelihood
+        ln2pi = np.log(2.0 * np.pi)
+        self.logll = (
+            -res.fun - (self.n * self.t) / 2.0 * ln2pi - (self.n * self.t) / 2.0
+        )
+
+        # b, residuals and predicted values
+        ys = self.y - self.lam * ylag
+        xs = self.x - self.lam * xlag
+        xsxs = spdot(xs.T, xs)
+        xsxsi = la.inv(xsxs)
+        xsys = spdot(xs.T, ys)
+        b = spdot(xsxsi, xsys)
+
+        self.betas = np.vstack((b, self.lam))
+
+        self.u = self.y - spdot(self.x, b)
+        self.predy = self.y - self.u
+
+        # residual variance
+        self.e_filtered = self.u - self.lam * spdot(Wsp_nt, self.u)
+        self.sig2 = spdot(self.e_filtered.T, self.e_filtered) / (self.n * self.t)
+
+        # variance-covariance matrix betas
+        varb = self.sig2 * xsxsi
+
+        # variance-covariance matrix lambda, sigma
+        a = -self.lam * W
+        spfill_diagonal(a, 1.0)
+        ai = spinv(a)
+        wai = spdot(Wsp, ai)
+        tr1 = wai.diagonal().sum()
+
+        wai2 = spdot(wai, wai)
+        tr2 = wai2.diagonal().sum()
+
+        waiTwai = spdot(wai.T, wai)
+        tr3 = waiTwai.diagonal().sum()
+
+        v1 = np.vstack((self.t * (tr2 + tr3), self.t * tr1 / self.sig2))
+        v2 = np.vstack(
+            (self.t * tr1 / self.sig2, self.t * self.n / (2.0 * self.sig2 ** 2))
+        )
+
+        v = np.hstack((v1, v2))
+
+        self.vm1 = la.inv(v)
+
+        # create variance matrix for beta, lambda
+        vv = np.hstack((varb, np.zeros((self.k, 1))))
+        vv1 = np.hstack((np.zeros((1, self.k)), self.vm1[0, 0] * np.ones((1, 1))))
+
+        self.vm = np.vstack((vv, vv1))
+        self.varb = varb
+        self.n = self.n * self.t
+
+
+
+[docs]
+class Panel_FE_Error(BasePanel_FE_Error):
+"""
+ ML estimation of the fixed effects spatial error model with all results and
+ diagnostics :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas object
+ nxt or (nxt)x1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ nx(txk) or (nxt)xk array for independent (exogenous)
+ variables, no constant
+ w : pysal W object
+ Spatial weights object
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ lam : float
+ estimate of spatial autoregressive coefficient
+ u : array
+ (nxt)x1 array of residuals
+ e_filtered : array
+ (nxt)x1 array of spatially filtered residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the lambda)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+1 x k+1), all coefficients
+ vm1 : array
+ Variance covariance matrix (k+2 x k+2), includes sig2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ aic : float
+ Akaike information criterion
+ schwarz : float
+ Schwarz criterion
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ utu : float
+ Sum of squared residuals
+ std_err : array
+ 1x(k+1) array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> nat = libpysal.examples.load_example("NCOVR")
+ >>> db = libpysal.io.open(nat.get_path("NAT.dbf"), "r")
+ >>> nat_shp = libpysal.examples.get_path("NAT.shp")
+ >>> w = libpysal.weights.Queen.from_shapefile(nat_shp)
+ >>> w.transform = 'r'
+ >>> name_y = ["HR70", "HR80", "HR90"]
+ >>> y = np.array([db.by_col(name) for name in name_y]).T
+ >>> name_x = ["RD70", "RD80", "RD90", "PS70", "PS80", "PS90"]
+ >>> x = np.array([db.by_col(name) for name in name_x]).T
+ >>> fe_error = spreg.Panel_FE_Error(y, x, w, name_y=name_y, name_x=name_x, name_ds="NAT")
+ Warning: Assuming panel is in wide format, i.e. y[:, 0] refers to T0, y[:, 1] refers to T1, etc.
+ Similarly, assuming x[:, 0:T] refers to T periods of k1, x[:, T+1:2T] refers to k2, etc.
+ >>> np.around(fe_error.betas, decimals=4)
+ array([[ 0.8698],
+ [-2.9661],
+ [ 0.1943]])
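+
+ As an illustrative follow-up (not part of the doctest), the spatial error
+ coefficient reported last in `betas` is also available directly as an
+ attribute::
+
+     print(fe_error.lam)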
+ """
+
+
+"""
+Spatial Random Effects Panel model based on: :cite:`Elhorst2003`
+"""
+
+__author__ = "Wei Kang weikang9009@gmail.com, \
+              Pedro Amaral pedroamaral@cedeplar.ufmg.br, \
+              Pablo Estrada pabloestradace@gmail.com"
+
+import numpy as np
+import numpy.linalg as la
+from scipy import sparse as sp
+from scipy.sparse.linalg import splu as SuperLU
+from .utils import RegressionPropsY, RegressionPropsVM, inverse_prod, set_warn
+from .sputils import spdot, spfill_diagonal, spinv
+from spreg.w_utils import symmetrize
+from . import diagnostics as DIAG
+from . import user_output as USER
+from . import summary_output as SUMMARY
+
+try:
+    from scipy.optimize import minimize_scalar
+
+    minimize_scalar_available = True
+except ImportError:
+    minimize_scalar_available = False
+try:
+    from scipy.optimize import minimize
+
+    minimize_available = True
+except ImportError:
+    minimize_available = False
+
+from .panel_utils import check_panel, demean_panel
+
+__all__ = ["Panel_RE_Lag", "Panel_RE_Error"]
+
+
+class BasePanel_RE_Lag(RegressionPropsY, RegressionPropsVM):
+"""
+ Base ML method for a random effects spatial lag model (note no consistency
+ checks, diagnostics or constants added) :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : array
+ (n*t)x1 array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and one column for each
+ independent (exogenous) variable
+ (note: must already include constant term)
+ w : pysal W object
+ Spatial weights matrix
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+
+ Attributes
+ ----------
+ betas : array
+ (k+2)x1 array of estimated coefficients (rho and phi last)
+ rho : float
+ estimate of spatial autoregressive coefficient
+ phi : float
+ estimate of weight attached to the cross-sectional component
+ phi^2 = sig2 / (t*sig2_u + sig2)
+ u : array
+ (nxt)x1 array of residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the rho and phi)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, no constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+2 x k+2)
+ vm1 : array
+ Variance covariance matrix (k+3 x k+3) includes sigma2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ predy_e : array
+ predicted values from reduced form
+ e_pred : array
+ prediction errors using reduced form predicted values
+ """
+
+ def__init__(self,bigy,bigx,w,epsilon=0.0000001):
+ # set up main regression variables and spatial filters
+ self.n=w.n
+ self.t=bigy.shape[0]//self.n
+ self.k=bigx.shape[1]
+ self.epsilon=epsilon
+ # Big W matrix
+ W=w.full()[0]
+ Wsp=w.sparse
+ Wsp_nt=sp.kron(sp.identity(self.t),Wsp,format="csr")
+ # Set up parameters
+ converge=1
+ criteria=0.0000001
+ i=0
+ itermax=100
+ self.rho=0.1
+ self.phi=0.1
+ I=sp.identity(self.n)
+ xtx=spdot(bigx.T,bigx)
+ xtxi=la.inv(xtx)
+ xty=spdot(bigx.T,bigy)
+ b=spdot(xtxi,xty)
+
+ # Iterative procedure
+ whileconverge>criteriaandi<itermax:
+ phiold=self.phi
+ res_phi=minimize_scalar(
+ phi_c_loglik,
+ 0.1,
+ bounds=(0.0,1.0),
+ args=(self.rho,b,bigy,bigx,self.n,self.t,Wsp_nt),
+ method="bounded",
+ options={"xatol":epsilon},
+ )
+ self.phi=res_phi.x[0][0]
+ # Demeaned variables
+ self.y=demean_panel(bigy,self.n,self.t,phi=self.phi)
+ self.x=demean_panel(bigx,self.n,self.t,phi=self.phi)
+ # lag dependent variable
+ ylag=spdot(Wsp_nt,self.y)
+ # b0, b1, e0 and e1
+ xtx=spdot(self.x.T,self.x)
+ xtxi=la.inv(xtx)
+ xty=spdot(self.x.T,self.y)
+ xtyl=spdot(self.x.T,ylag)
+ b0=spdot(xtxi,xty)
+ b1=spdot(xtxi,xtyl)
+ e0=self.y-spdot(self.x,b0)
+ e1=ylag-spdot(self.x,b1)
+ res_rho=minimize_scalar(
+ lag_c_loglik_sp,
+ 0.0,
+ bounds=(-1.0,1.0),
+ args=(self.n,self.t,e0,e1,I,Wsp),
+ method="bounded",
+ options={"xatol":epsilon},
+ )
+ self.rho=res_rho.x[0][0]
+ b=b0-self.rho*b1
+ i+=1
+ converge=np.abs(phiold-self.phi)
+
+ # compute full log-likelihood, including constants
+ ln2pi=np.log(2.0*np.pi)
+ llik=-res_rho.fun-(self.n*self.t)/2.0*ln2pi-(self.n*self.t)/2.0
+ self.logll=llik[0][0]
+
+ # b, residuals and predicted values
+ self.betas=np.vstack((b,self.rho,self.phi))
+ self.u=e0-self.rho*e1
+ self.predy=self.y-self.u
+ xb=spdot(self.x,b)
+
+ self.predy_e=inverse_prod(
+ Wsp_nt,xb,self.rho,inv_method="power_exp",threshold=epsilon
+ )
+ self.e_pred=self.y-self.predy_e
+
+ # residual variance
+ self._cache={}
+ self.sig2=spdot(self.u.T,self.u)/(self.n*self.t)
+
+ # information matrix
+ a=-self.rho*W
+ spfill_diagonal(a,1.0)
+ ai=spinv(a)
+ wai=spdot(W,ai)
+ tr1=wai.diagonal().sum()# same for sparse and dense
+
+ wai2=spdot(wai,wai)
+ tr2=wai2.diagonal().sum()
+
+ waiTwai=spdot(wai.T,wai)
+ tr3=waiTwai.diagonal().sum()
+
+ wai_nt=sp.kron(sp.identity(self.t),wai,format="csr")
+ wpredy=spdot(wai_nt,xb)
+ xTwpy=spdot(self.x.T,wpredy)
+
+ waiTwai_nt=sp.kron(sp.identity(self.t),waiTwai,format="csr")
+ wTwpredy=spdot(waiTwai_nt,xb)
+ wpyTwpy=spdot(xb.T,wTwpredy)
+
+ # order of variables is beta, rho, sigma2
+ v1=np.vstack((xtx/self.sig2,xTwpy.T/self.sig2,np.zeros((2,self.k))))
+ v2=np.vstack(
+ (
+ xTwpy/self.sig2,
+ self.t*(tr2+tr3)+wpyTwpy/self.sig2,
+ -tr1/self.sig2,
+ self.t*tr1/self.sig2,
+ )
+ )
+ v3=np.vstack(
+ (
+ np.zeros((self.k,1)),
+ -tr1/self.sig2,
+ self.n*(1+1/self.phi**2),
+ -self.n/self.sig2,
+ )
+ )
+ v4=np.vstack(
+ (
+ np.zeros((self.k,1)),
+ self.t*tr1/self.sig2,
+ -self.n/self.sig2**2,
+ self.n*self.t/(2.0*self.sig2**2),
+ )
+ )
+
+ v=np.hstack((v1,v2,v3,v4))
+
+ self.vm1=la.inv(v)# vm1 includes variance for sigma2
+ self.vm=self.vm1[:-1,:-1]# vm is for coefficients and phi
+ self.varb=la.inv(np.hstack((v1[:-2],v2[:-2])))
+ self.n=self.n*self.t
+
+
+
+[docs]
+class Panel_RE_Lag(BasePanel_RE_Lag):
+"""
+ ML estimation of the random effects spatial lag model with all results and
+ diagnostics :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas object
+ nxt or (nxt)x1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ nx(txk) or (nxt)xk array for independent (exogenous)
+ variables, excluding the constant
+ w : pysal W object
+ Spatial weights object
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+
+ Attributes
+ ----------
+ betas : array
+ (k+2)x1 array of estimated coefficients (rho and phi last)
+ rho : float
+ estimate of spatial autoregressive coefficient
+ phi : float
+ estimate of weight attached to the cross-sectional component
+ phi^2 = sig2 / (t*sig2_u + sig2)
+ u : array
+ (nxt)x1 array of residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (excluding the phi)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+2 x k+2)
+ vm1 : array
+ Variance covariance matrix (k+3 x k+3) includes sigma2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ predy_e : array
+ predicted values from reduced form
+ e_pred : array
+ prediction errors using reduced form predicted values
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ utu : float
+ Sum of squared residuals
+ std_err : array
+ 1x(k+1) array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import pandas as pd
+ >>> import libpysal
+ >>> import spreg
+ >>> from libpysal.weights import w_subset
+ >>> nat = libpysal.examples.load_example("NCOVR")
+ >>> db = libpysal.io.open(nat.get_path("NAT.dbf"), "r")
+ >>> nat_shp = libpysal.examples.get_path("NAT.shp")
+ >>> w_full = libpysal.weights.Queen.from_shapefile(nat_shp)
+ >>> name_y = ["HR70", "HR80", "HR90"]
+ >>> y_full = np.array([db.by_col(name) for name in name_y]).T
+ >>> name_x = ["RD70", "RD80", "RD90", "PS70", "PS80", "PS90"]
+ >>> x_full = np.array([db.by_col(name) for name in name_x]).T
+ >>> name_c = ["STATE_NAME", "FIPSNO"]
+ >>> df_counties = pd.DataFrame([db.by_col(name) for name in name_c], index=name_c).T
+ >>> filter_states = ["Kansas", "Missouri", "Oklahoma", "Arkansas"]
+ >>> filter_counties = df_counties[df_counties["STATE_NAME"].isin(filter_states)]["FIPSNO"].values
+ >>> counties = np.array(db.by_col("FIPSNO"))
+ >>> subid = np.where(np.isin(counties, filter_counties))[0]
+ >>> w = w_subset(w_full, subid)
+ >>> w.transform = 'r'
+ >>> y = y_full[subid, ]
+ >>> x = x_full[subid, ]
+ >>> re_lag = spreg.Panel_RE_Lag(y, x, w, name_y=name_y, name_x=name_x, name_ds="NAT")
+ Warning: Assuming panel is in wide format, i.e. y[:, 0] refers to T0, y[:, 1] refers to T1, etc.
+ Similarly, assuming x[:, 0:T] refers to T periods of k1, x[:, T+1:2T] refers to k2, etc.
+ >>> np.around(re_lag.betas, decimals=4)
+ array([[4.44421994],
+ [2.52821717],
+ [2.24768846],
+ [0.25846846],
+ [0.68426639]])
+ """
+
+
+[docs]
+    def __init__(
+        self,
+        y,
+        x,
+        w,
+        epsilon=0.0000001,
+        vm=False,
+        name_y=None,
+        name_x=None,
+        name_w=None,
+        name_ds=None,
+    ):
+        n_rows = USER.check_arrays(y, x)
+        bigy, bigx, name_y, name_x, warn = check_panel(y, x, w, name_y, name_x)
+        set_warn(self, warn)
+        bigx, name_x, warn = USER.check_constant(bigx, name_x)
+        set_warn(self, warn)
+        w = USER.check_weights(w, bigy, w_required=True, time=True)
+
+        BasePanel_RE_Lag.__init__(self, bigy, bigx, w, epsilon=epsilon)
+        # increase by 1 to have correct aic and sc, include rho in count
+        self.k += 1
+        self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG PANEL" + " - RANDOM EFFECTS"
+        self.name_ds = USER.set_name_ds(name_ds)
+        self.name_y = USER.set_name_y(name_y)
+        self.name_x = USER.set_name_x(name_x, bigx, constant=False)
+        name_ylag = USER.set_name_yend_sp(self.name_y)
+        self.name_x.append(name_ylag)  # rho changed to last position
+        self.name_x.append("phi")  # error variance parameter
+        self.name_w = USER.set_name_w(name_w, w)
+        self.aic = DIAG.akaike(reg=self)
+        self.schwarz = DIAG.schwarz(reg=self)
+        SUMMARY.Panel_FE_Lag(reg=self, w=w, vm=vm)
+
+
+
+
+class BasePanel_RE_Error(RegressionPropsY, RegressionPropsVM):
+"""
+ Base ML method for a random effects spatial error model (note no
+ consistency checks, diagnostics or constants added) :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : array
+ (n*t)x1 array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and one column for each
+ independent (exogenous) variable
+ (note: must already include constant term)
+ w : pysal W object
+ Spatial weights matrix
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ lam : float
+ estimate of spatial autoregressive coefficient
+ sig2_u : float
+ Sigma squared for random effects
+ u : array
+ (nxt)x1 array of residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the lambda)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, no constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+2 x k+2)
+ vm1 : array
+ Variance covariance matrix (k+3 x k+3) includes sigma2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ """
+
+ def__init__(self,y,x,w,epsilon=0.0000001):
+ # set up main regression variables and spatial filters
+ self.n=w.n
+ self.t=y.shape[0]//self.n
+ self.k=x.shape[1]
+ self.epsilon=epsilon
+ # Demeaned variables
+ self.y=y
+ self.x=x
+ # Big W matrix
+ W=w.full()[0]
+ Wsp=w.sparse
+ Wsp_nt=sp.kron(sp.identity(self.t),Wsp,format="csr")
+ # lag dependent variable
+ ylag=spdot(Wsp_nt,self.y)
+ xlag=spdot(Wsp_nt,self.x)
+
+ # concentrated Log Likelihood
+ I=np.identity(self.n)
+ ifw.asymmetry(intrinsic=False)==[]:
+ ww=symmetrize(w)
+ WW=np.array(ww.todense())
+ evals,evecs=la.eigh(WW)
+ W=WW
+ else:# need dense here
+ evals,evecs=la.eig(W)
+ one=np.ones((self.t,1))
+ J=(1/self.t)*spdot(one,one.T)
+ Q=sp.kron(J,I,format="csr")
+ y_mean=spdot(Q,self.y)
+ x_mean=spdot(Q,self.x)
+ res=minimize(
+ err_c_loglik_ord,
+ (0.0,0.1),
+ bounds=((-1.0,1.0),(0.0,10000.0)),
+ method="L-BFGS-B",
+ args=(
+ evals,
+ evecs,
+ self.n,
+ self.t,
+ self.y,
+ self.x,
+ ylag,
+ xlag,
+ y_mean,
+ x_mean,
+ I,
+ Wsp,
+ ),
+ )
+ self.lam,self.phi=res.x
+
+ # compute full log-likelihood
+ ln2pi=np.log(2.0*np.pi)
+ self.logll=(
+ -res.fun-(self.n*self.t)/2.0*ln2pi-(self.n*self.t)/2.0
+ )
+
+ # b, residuals and predicted values
+ cvals=self.t*self.phi**2+1/(1-self.lam*evals)**2
+ P=spdot(np.diag(cvals**(-0.5)),evecs.T)
+ pr=P-(I-self.lam*W)
+ pr_nt=sp.kron(sp.identity(self.t),pr,format="csr")
+ yrand=self.y+spdot(pr_nt,y_mean)
+ xrand=self.x+spdot(pr_nt,x_mean)
+ ys=yrand-self.lam*ylag
+ xs=xrand-self.lam*xlag
+ xsxs=spdot(xs.T,xs)
+ xsxsi=la.inv(xsxs)
+ xsys=spdot(xs.T,ys)
+ b=spdot(xsxsi,xsys)
+
+ self.u=self.y-spdot(self.x,b)
+ self.predy=self.y-self.u
+
+ # residual variance
+ self.e_filtered=ys-spdot(xs,b)
+ self.sig2=spdot(self.e_filtered.T,self.e_filtered)/(self.n*self.t)
+
+ # variance-covariance matrix betas
+ varb=self.sig2*xsxsi
+ # variance of random effects
+ self.sig2_u=self.phi**2*self.sig2
+
+ self.betas=np.vstack((b,self.lam,self.sig2_u))
+
+ # variance-covariance matrix lambda, sigma
+ a=-self.lam*W
+ spfill_diagonal(a,1.0)
+ aTai=la.inv(spdot(a.T,a))
+ wa_aw=spdot(W.T,a)+spdot(a.T,W)
+ gamma=spdot(wa_aw,aTai)
+ vi=la.inv(self.t*self.phi*I+aTai)
+ sigma=spdot(vi,aTai)
+
+ tr1=gamma.diagonal().sum()
+ tr2=vi.diagonal().sum()
+ tr3=sigma.diagonal().sum()
+
+ sigma_gamma=spdot(sigma,gamma)
+ tr4=sigma_gamma.diagonal().sum()
+
+ sigma_vi=spdot(sigma,vi)
+ tr5=sigma_vi.diagonal().sum()
+
+ sigma_gamma_vi=spdot(sigma_gamma,vi)
+ tr6=sigma_gamma_vi.diagonal().sum()
+
+ sigma_gamma_sigma=spdot(sigma_gamma,sigma)
+ tr7=sigma_gamma_sigma.diagonal().sum()
+
+ v1=np.vstack(
+ (
+ (self.t-1)/2*tr1**2+1/2*tr4**2,
+ self.t/(2*self.sig2)*tr6,
+ (self.t-1)/(2*self.sig2)*tr1+1/(2*self.sig2)*tr7,
+ )
+ )
+ v2=np.vstack(
+ (
+ self.t/(2*self.sig2)*tr6,
+ self.t**2/(2.0*self.sig2**2)*tr2**2,
+ self.t/(2.0*self.sig2**2)*tr5,
+ )
+ )
+ v3=np.vstack(
+ (
+ (self.t-1)/(2*self.sig2)*tr1+1/(2*self.sig2)*tr7,
+ self.t/(2.0*self.sig2**2)*tr5,
+ 1/(2.0*self.sig2**2)*((self.t-1)*self.n+tr3**2),
+ )
+ )
+
+ v=np.hstack((v1,v2,v3))
+
+ vm1=np.linalg.inv(v)
+
+ # create variance matrix for beta, lambda
+ vv=np.hstack((varb,np.zeros((self.k,2))))
+ vv1=np.hstack((np.zeros((2,self.k)),vm1[:2,:2]))
+
+ self.vm=np.vstack((vv,vv1))
+ self.varb=varb
+ self.n=self.n*self.t
+
+
+
+[docs]
+class Panel_RE_Error(BasePanel_RE_Error):
+"""
+ ML estimation of the random effects spatial error model with all results and
+ diagnostics :cite:`Elhorst2003`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas object
+ nxt or (nxt)x1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ nx(txk) or (nxt)xk array for independent (exogenous)
+ variables, no constant
+ w : pysal W object
+ Spatial weights object
+ epsilon : float
+ tolerance criterion in minimize_scalar function and
+ inverse_product
+ vm : boolean
+ if True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ lam : float
+ estimate of spatial autoregressive coefficient
+ sig2_u : float
+ Sigma squared for random effects
+ u : array
+ (nxt)x1 array of residuals
+ e_filtered : array
+ (nxt)x1 array of spatially filtered residuals
+ predy : array
+ (nxt)x1 array of predicted y values
+ n : integer
+ Total number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (no constant, excluding the lambda)
+ y : array
+ (nxt)x1 array for dependent variable
+ x : array
+ Two dimensional array with nxt rows and one column for each
+ independent (exogenous) variable, including the constant
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (k+2 x k+2), all coefficients
+ vm1 : array
+ Variance covariance matrix (k+3 x k+3), includes sig2
+ sig2 : float
+ Sigma squared used in computations
+ logll : float
+ maximized log-likelihood (including constant terms)
+ aic : float
+ Akaike information criterion
+ schwarz : float
+ Schwarz criterion
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ utu : float
+ Sum of squared residuals
+ std_err : array
+ 1x(k+1) array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import pandas as pd
+ >>> import libpysal
+ >>> import spreg
+ >>> from libpysal.weights import w_subset
+ >>> nat = libpysal.examples.load_example("NCOVR")
+ >>> db = libpysal.io.open(nat.get_path("NAT.dbf"), "r")
+ >>> nat_shp = libpysal.examples.get_path("NAT.shp")
+ >>> w_full = libpysal.weights.Queen.from_shapefile(nat_shp)
+ >>> name_y = ["HR70", "HR80", "HR90"]
+ >>> y_full = np.array([db.by_col(name) for name in name_y]).T
+ >>> name_x = ["RD70", "RD80", "RD90", "PS70", "PS80", "PS90"]
+ >>> x_full = np.array([db.by_col(name) for name in name_x]).T
+ >>> name_c = ["STATE_NAME", "FIPSNO"]
+ >>> df_counties = pd.DataFrame([db.by_col(name) for name in name_c], index=name_c).T
+ >>> filter_states = ["Kansas", "Missouri", "Oklahoma", "Arkansas"]
+ >>> filter_counties = df_counties[df_counties["STATE_NAME"].isin(filter_states)]["FIPSNO"].values
+ >>> counties = np.array(db.by_col("FIPSNO"))
+ >>> subid = np.where(np.isin(counties, filter_counties))[0]
+ >>> w = w_subset(w_full, subid)
+ >>> w.transform = 'r'
+ >>> y = y_full[subid, ]
+ >>> x = x_full[subid, ]
+ >>> re_error = spreg.Panel_RE_Error(y, x, w, name_y=name_y, name_x=name_x, name_ds="NAT")
+ Warning: Assuming panel is in wide format, i.e. y[:, 0] refers to T0, y[:, 1] refers to T1, etc.
+ Similarly, assuming x[:, 0:T] refers to T periods of k1, x[:, T+1:2T] refers to k2, etc.
+ >>> np.around(re_error.betas, decimals=4)
+ array([[5.87893756],
+ [3.23269025],
+ [2.62996804],
+ [0.34042682],
+ [4.9782446]])
+ """
+
+
+[docs]
+class Probit(BaseProbit):
+"""
+ Classic non-spatial Probit with spatial diagnostics. The class includes a
+ summary printout that formats all the results and tests.
+
+ The diagnostics for spatial dependence currently implemented are:
+
+ * Pinkse Error :cite:`Pinkse2004`
+
+ * Kelejian and Prucha Moran's I :cite:`Kelejian2001`
+
+ * Pinkse & Slade Error :cite:`Pinkse1998`
+
+ Parameters
+ ----------
+
+ x : numpy.ndarray or pandas object
+ nxk array of independent variables (assumed to be aligned with y)
+ y : numpy.ndarray or pandas.Series
+ nx1 array of dependent binary variable
+ w : W
+ PySAL weights instance aligned with y
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX type.
+ optim : string
+ Optimization method.
+ Default: 'newton' (Newton-Raphson).
+ Alternatives: 'ncg' (Newton-CG), 'bfgs' (BFGS algorithm)
+ scalem : string
+ Method to calculate the scale of the marginal effects.
+ Default: 'phimean' (Mean of individual marginal effects)
+ Alternative: 'xmean' (Marginal effects at variables mean)
+ maxiter : int
+ Maximum number of iterations until optimizer stops
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+
+ Attributes
+ ----------
+
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ y : array
+ nx1 array of dependent variable
+ betas : array
+ kx1 array with estimated coefficients
+ predy : array
+ nx1 array of predicted y values
+ n : int
+ Number of observations
+ k : int
+ Number of variables
+ vm : array
+ Variance-covariance matrix (kxk)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ xmean : array
+ Mean of the independent variables (kx1)
+ predpc : float
+ Percent of y correctly predicted
+ logl : float
+ Log-Likelihood of the estimation
+ scalem : string
+ Method to calculate the scale of the marginal effects.
+ scale : float
+ Scale of the marginal effects.
+ slopes : array
+ Marginal effects of the independent variables (k-1x1)
+ slopes_vm : array
+ Variance-covariance matrix of the slopes (k-1xk-1)
+ LR : tuple
+ Likelihood Ratio test of all coefficients = 0
+ (test statistics, p-value)
+ Pinkse_error: float
+ Lagrange Multiplier test against spatial error correlation.
+ Implemented as presented in :cite:`Pinkse2004`
+ KP_error : float
+ Moran's I type test against spatial error correlation.
+ Implemented as presented in :cite:`Kelejian2001`
+ PS_error : float
+ Lagrange Multiplier test against spatial error correlation.
+ Implemented as presented in :cite:`Pinkse1998`
+ warning : boolean
+ if True Maximum number of iterations exceeded or gradient
+ and/or function calls not changing.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``libpysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
+
+ Extract the CRIME column (crime) from the DBF file and make it the
+ dependent variable for the regression. Note that libpysal requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept. Since we want to run a probit model and for this
+ example we use the Columbus data, we also need to transform the continuous
+ CRIME variable into a binary variable. As in :cite:`McMillen1992`, we define
+ y = 1 if CRIME > 40.
+
+ >>> y = np.array([dbf.by_col('CRIME')]).T
+ >>> y = (y>40).astype(float)
+
+ Extract HOVAL (home values) and INC (income) vectors from the DBF to be used as
+ independent variables in the regression. Note that libpysal requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this class adds a vector of ones to the
+ independent variables passed in.
+
+ >>> names_to_extract = ['INC', 'HOVAL']
+ >>> x = np.array([dbf.by_col(name) for name in names_to_extract]).T
+
+ Since we want to test the probit model for spatial dependence, we need to
+ specify the spatial weights matrix that includes the spatial configuration of
+ the observations into the error component of the model. To do that, we can open
+ an already existing gal file or create a new one. In this case, we will use
+ ``columbus.gal``, which contains contiguity relationships between the
+ observations in the Columbus dataset we are using throughout this example.
+ Note that, in order to read the file and not just open it, we need to
+ append '.read()' at the end of the command.
+
+ >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. In libpysal, this
+ can be easily performed in the following way:
+
+ >>> w.transform='r'
+
+ We are all set with the preliminaries, we are good to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional.
+
+ >>> from spreg import Probit
+ >>> model = Probit(y, x, w=w, name_y='crime', name_x=['income','home value'], name_ds='columbus', name_w='columbus.gal')
+
+ Once we have run the model, we can explore a little bit the output. The
+ regression object we have created has many attributes so take your time to
+ discover them.
+
+ >>> np.around(model.betas, decimals=6)
+ array([[ 3.353811],
+ [-0.199653],
+ [-0.029514]])
+
+ >>> np.around(model.vm, decimals=6)
+ array([[ 0.852814, -0.043627, -0.008052],
+ [-0.043627, 0.004114, -0.000193],
+ [-0.008052, -0.000193, 0.00031 ]])
+
+ Since we have provided a spatial weights matrix, the diagnostics for
+ spatial dependence have also been computed. We can access them and their
+ p-values individually:
+
+ >>> tests = np.array([['Pinkse_error','KP_error','PS_error']])
+ >>> stats = np.array([[model.Pinkse_error[0],model.KP_error[0],model.PS_error[0]]])
+ >>> pvalue = np.array([[model.Pinkse_error[1],model.KP_error[1],model.PS_error[1]]])
+ >>> print(np.hstack((tests.T,np.around(np.hstack((stats.T,pvalue.T)),6))))
+ [['Pinkse_error' '3.131719' '0.076783']
+ ['KP_error' '1.721312' '0.085194']
+ ['PS_error' '2.558166' '0.109726']]
+
+ Or we can easily obtain a full summary of all the results, nicely formatted and
+ ready to be printed, simply by typing 'print(model.summary)'
+
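+ For instance (shown here outside the doctest), the formatted report and the
+ estimated marginal effects documented above can be inspected as follows::
+
+     print(model.summary)
+     print(model.slopes)
+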
+ """
+
+
+"""Skater Regression classes."""
+
+__author__ = "Luc Anselin anselin@uchicago.edu, Pedro Amaral pedroamaral@cedeplar.ufmg.br, Levi Wolf levi.john.wolf@bristol.ac.uk"
+
+from scipy.sparse import csgraph as cg
+from scipy.optimize import OptimizeWarning
+from collections import namedtuple
+from warnings import warn
+from libpysal.weights import w_subset
+from .utils import set_endog
+from .twosls_regimes import TSLS_Regimes
+import time
+import numpy as np
+import copy
+
+try:
+    from sklearn.metrics import euclidean_distances
+except ImportError:
+    from scipy.spatial.distance import pdist, cdist, squareform
+
+    def euclidean_distances(X, Y=None):
+        """
+        Fallback function to compute pairwise euclidean distances
+        for a single input, or point-to-point euclidean distances
+        for two inputs.
+        """
+        if Y is None:
+            return squareform(pdist(X))
+        else:
+            return cdist(X, Y)
+
+
+__all__ = ["Skater_reg"]
+
+deletion = namedtuple("deletion", ("in_node", "out_node", "score"))
+
+
+
+[docs]
+class Skater_reg(object):
+"""
+ Initialize the Skater_reg algorithm based on :cite:`Anselin2021`.
+ The class can currently estimate OLS models, from
+ spreg or statsmodels, and Spatial Lag models from spreg.
+ The fit method performs the estimation and returns a Skater_reg object.
+
+ Parameters
+ ----------
+ dissimilarity : a callable distance metric.
+ Default: sklearn.metrics.pairwise.euclidean_distances
+ affinity : a callable affinity metric between 0,1.
+ Will be inverted to provide a
+ dissimilarity metric.
+ reduction : the reduction applied over all clusters
+ to provide the map score.
+ Default: numpy.sum
+ center : way to compute the center of each region in attribute space
+ Default: numpy.mean
+
+ NOTE: Optimization occurs with respect to a *dissimilarity* metric, so the reduction should
+ yield some kind of score where larger values are *less desirable* than smaller values.
+ Typically, this means we use addition.
+
+
+ Attributes
+ ----------
+ coords : array-like
+ n*2, collection of n sets of (x,y) coordinates used for
+ calibration locations
+ y : array
+ n*1, dependent variable
+ X : array
+ n*k, independent variable, not including the constant
+ bw : scalar
+ bandwidth value consisting of either a distance or N
+ nearest neighbors; user specified or obtained using
+ Sel_BW
+ family : family object
+ underlying probability model; provides
+ distribution-specific calculations
+ offset : array
+ n*1, the offset variable at the ith location. For Poisson model
+ this term is often the size of the population at risk or
+ the expected size of the outcome in spatial epidemiology
+ Default is None where Ni becomes 1.0 for all locations
+ sigma2_v1 : boolean
+ specify form of corrected denominator of sigma squared to use for
+ model diagnostics; Acceptable options are:
+ 'True': n-tr(S) (default)
+ 'False': n-2(tr(S)+tr(S'S))
+ kernel : string
+ type of kernel function used to weight observations;
+ available options:
+ 'gaussian'
+ 'bisquare'
+ 'exponential'
+ fixed : boolean
+ True for distance based kernel function and False for
+ adaptive (nearest neighbor) kernel function (default)
+ constant : boolean
+ True to include intercept (default) in model and False to exclude
+ intercept
+ spherical : boolean
+ True for spherical coordinates (long-lat),
+ False for projected coordinates (default).
+ hat_matrix : boolean
+ True to store full n by n hat matrix,
+ False to not store full hat matrix to minimize memory footprint (default).
+ n : integer
+ number of observations
+ k : integer
+ number of independent variables
+ mean_y : float
+ mean of y
+ std_y : float
+ standard deviation of y
+ fit_params : dict
+ parameters passed into fit method to define estimation
+ routine
+ points : array-like
+ n*2, collection of n sets of (x,y) coordinates used for
+ calibration locations instead of all observations;
+ defaults to None unless specified in predict method
+ P : array
+ n*k, independent variables used to make prediction;
+ excluding the constant; defaults to None unless specified
+ in predict method
+ exog_scale : scalar
+ estimated scale using sampled locations; default is None
+ unless specified in predict method
+ exog_resid : array-like
+ estimated residuals using sampled locations; default is None
+ unless specified in predict method
+ Examples
+ --------
+ >>> import libpysal as ps
+ >>> import numpy as np
+ >>> import spreg
+ >>> from spreg.skater_reg import Skater_reg
+ >>> data = ps.io.open(ps.examples.get_path('columbus.dbf'))
+ >>> y = np.array(data.by_col('HOVAL')).reshape((-1,1))
+ >>> x_var = ['INC','CRIME']
+ >>> x = np.array([data.by_col(name) for name in x_var]).T
+ >>> w = ps.weights.Queen.from_shapefile(ps.examples.get_path("columbus.shp"))
+ >>> x_std = (x - np.mean(x,axis=0)) / np.std(x,axis=0)
+ >>> results = Skater_reg().fit(3, w, x_std, {'reg':spreg.OLS,'y':y,'x':x}, quorum=10, trace=False)
+ >>> results.current_labels_
+ array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
+ 0, 1, 0, 0, 2, 2, 0, 0, 1, 0, 2, 1, 2, 1, 2, 0, 1, 0, 0, 1, 2, 2,
+ 2, 1, 0, 2, 2], dtype=int32)
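+
+ When `fit` is called with `trace=True` (it is disabled above), the pruning
+ history is kept on the fitted object; a sketch of how one might retrieve the
+ per-step scores::
+
+     results = Skater_reg().fit(3, w, x_std, {'reg':spreg.OLS,'y':y,'x':x}, quorum=10, trace=True)
+     scores = [step[1].score for step in results._trace]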
+ """
+
+
+[docs]
+    def fit(
+        self,
+        n_clusters,
+        W,
+        data=None,
+        data_reg=None,
+        quorum=-np.inf,
+        trace=True,
+        islands="increase",
+        verbose=False,
+        model_family="spreg",
+    ):
+"""
+ Method that fits a model with a particular estimation routine.
+
+ Parameters
+ ----------
+ n_clusters : int of clusters wanted
+ W : pysal W object expressing the neighbor relationships between observations.
+ Should be symmetric and binary, so Queen/Rook, DistanceBand, or a symmetrized KNN.
+ data : np.ndarray of (N,P) shape with N observations and P features
+ This is the data that is used to evaluate the similarity between each observation.
+ data_reg : dict containing:
+ 1- a callable regression method (ex. OLS or GM_Lag from spreg or OLS from statsmodels)
+ 2- np.ndarray of (N,1) shape with N observations on the dependent variable for the regression
+ 3- np.ndarray of (N,k) shape with N observations and k columns containing the explanatory variables (constant must not be included)
+ 4- pysal W object to be used in the regression (optional)
+ quorum : int with minimum size of each region.
+ trace : bool denoting whether to store intermediate
+ labelings as the tree gets pruned
+ islands : string describing what to do with islands.
+ If "ignore", will discover `n_clusters` regions, treating islands as their own regions.
+ If "increase", will discover `n_clusters` regions, treating islands as separate from n_clusters.
+ verbose : bool/int describing how much output to provide to the user,
+ in terms of print statements or progressbars.
+ model_family : string describing the family of estimation method used for the regression.
+ Must be either 'spreg' (default) or 'statsmodels'
+
+ Returns
+ -------
+ : Skater_reg object.
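+
+ For illustration, a minimal call (mirroring the class-level example above, so
+ `w`, `x_std`, `y` and `x` are assumed to exist) passes the estimation routine
+ and data through the `data_reg` dictionary::
+
+     data_reg = {'reg': spreg.OLS, 'y': y, 'x': x}
+     results = Skater_reg().fit(3, w, data=x_std, data_reg=data_reg, quorum=10)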
+ """
+ iftrace:
+ self._trace=[]
+ ifdataisNone:
+ attribute_kernel=np.ones((W.n,W.n))
+ data=np.ones((W.n,1))
+ else:
+ attribute_kernel=self.metric(data)
+ W.transform="b"
+ W=W.sparse
+ start=time.time()
+
+ super_verbose=verbose>1
+ start_W=time.time()
+ dissim=W.multiply(attribute_kernel)
+ dissim.eliminate_zeros()
+ end_W=time.time()-start_W
+
+ ifsuper_verbose:
+ print("Computing Affinity Kernel took {:.2f}s".format(end_W))
+
+ tree_time=time.time()
+ MSF=cg.minimum_spanning_tree(dissim)
+ tree_time=time.time()-tree_time
+ ifsuper_verbose:
+ print("Computing initial MST took {:.2f}s".format(tree_time))
+
+ initial_component_time=time.time()
+ current_n_subtrees,current_labels=cg.connected_components(
+ MSF,directed=False
+ )
+ initial_component_time=time.time()-initial_component_time
+
+ ifsuper_verbose:
+ print(
+ "Computing connected components took {:.2f}s.".format(
+ initial_component_time
+ )
+ )
+
+ ifcurrent_n_subtrees>1:
+ island_warnings=[
+ "Increasing `n_clusters` from {} to {} in order to account for islands.".format(
+ n_clusters,n_clusters+current_n_subtrees
+ ),
+ "Counting islands towards the remaining {} clusters.".format(
+ n_clusters-(current_n_subtrees)
+ ),
+ ]
+ ignoring_islands=int(islands.lower()=="ignore")
+ chosen_warning=island_warnings[ignoring_islands]
+ warn(
+ "By default, the graph is disconnected! {}".format(chosen_warning),
+ OptimizeWarning,
+ stacklevel=2,
+ )
+ ifnotignoring_islands:
+ n_clusters+=current_n_subtrees
+ _,island_populations=np.unique(current_labels,return_counts=True)
+ if(island_populations<quorum).any():
+ raiseValueError(
+ "Islands must be larger than the quorum. If not, drop the small islands and solve for"
+ " clusters in the remaining field."
+ )
+ iftrace:
+ self._trace.append(([],deletion(np.nan,np.nan,np.inf)))
+ ifsuper_verbose:
+ print(self._trace[-1])
+ trees_scores=None
+ prev_score=np.inf
+ whilecurrent_n_subtrees<n_clusters:# while we don't have enough regions
+ (
+ best_deletion,
+ trees_scores,
+ new_MSF,
+ current_n_subtrees,
+ current_labels,
+ )=self.find_cut(
+ MSF,
+ data,
+ data_reg,
+ current_n_subtrees,
+ current_labels,
+ quorum=quorum,
+ trees_scores=trees_scores,
+ labels=None,
+ target_label=None,
+ verbose=verbose,
+ model_family=model_family,
+ )
+
+ ifnp.isfinite(best_deletion.score):# if our search succeeds
+ # accept the best move as *the* move
+ ifsuper_verbose:
+ print("cut made {}...".format(best_deletion))
+ ifbest_deletion.score>prev_score:
+ raiseValueError(
+ ("The score increased with the number of clusters. "
+ "Please check your data.\nquorum: {}; n_clusters: {}"
+ ).format(quorum,n_clusters)
+ )
+ prev_score=best_deletion.score
+ MSF=new_MSF
+ else:# otherwise, it means the MSF admits no further cuts
+ prev_n_subtrees,_=cg.connected_components(MSF,directed=False)
+ warn(
+ "MSF contains no valid moves after finding {} subtrees. "
+ "Decrease the size of your quorum to find the remaining {} subtrees.".format(
+ prev_n_subtrees,n_clusters-prev_n_subtrees
+ ),
+ OptimizeWarning,
+ stacklevel=2,
+ )
+ iftrace:
+ self._trace.append((current_labels,best_deletion))
+
+ self.current_labels_=current_labels
+ self.minimum_spanning_forest_=MSF
+ self._elapsed_time=time.time()-start
+ returnself
+
+
+
+[docs]
+    def score_spreg(
+        self,
+        data=None,
+        data_reg=None,
+        all_labels=None,
+        quorum=-np.inf,
+        current_labels=None,
+        current_tree=None,
+    ):
+"""
+ This yields a score for the data using methods from the spreg library, given the labels provided. If no labels are provided,
+ and the object has been fit, then the labels discovered from the previous fit are used.
+
+ If a quorum is not passed, it is assumed to be irrelevant.
+
+ If a quorum is passed and the labels do not meet quorum, the score is inf.
+
+ data : (N,P) array of data on which to compute the score of the regions expressed in labels
+ data_reg : dictionary containing:
+ 1- a callable spreg regression method (ex. OLS or GM_Lag)
+ 2- np.ndarray of (N,1) shape with N observations on the dependent variable for the regression
+ 3- np.ndarray of (N,k) shape with N observations and k columns containing the explanatory variables (constant must not be included)
+ 4- pysal W object to be used in the regression (optional)
+ all_labels : (N,) flat vector of labels expressing the classification of each observation into a region, considering the cut under evaluation.
+ quorum : int expressing the minimum size of regions. Can be -inf if there is no lower bound.
+ Any region below quorum makes the score inf.
+ current_labels: (N,) flat vector of labels expressing the classification of each observation into a region, not considering the cut under evaluation.
+
+ current_tree: integer indicating the label of the tree currently being considered for division
+ """
+
+ labels,subtree_quorums=self._prep_score(
+ all_labels,current_tree,current_labels
+ )
+ if(subtree_quorums<quorum).any():
+ returnnp.inf,None
+ set_labels=set(labels)
+ ifdata_regisnotNone:
+ kargs={
+ k:v
+ fork,vindata_reg.items()
+ ifknotin["reg","y","x","w","x_nd"]
+ }
+ trees_scores={}
+
+ ifdata_reg["reg"].__name__=="GM_Lag"ordata_reg["reg"].__name__=="BaseGM_Lag":
+ try:
+ x=np.hstack((np.ones((data_reg["x"].shape[0],1)),data_reg["x"]))
+ reg=TSLS_Regimes(
+ y=data_reg["y"],
+ x=x,
+ yend=data_reg["yend"],
+ q=data_reg["q"],
+ regimes=all_labels,)
+ except:
+ x=_const_x(data_reg["x"])
+ reg=TSLS_Regimes(
+ y=data_reg["y"],
+ x=x,
+ yend=data_reg["yend"],
+ q=data_reg["q"],
+ regimes=all_labels,)
+ score=np.dot(reg.u.T,reg.u)[0][0]
+ else:
+
+ forlinset_labels:
+ x=data_reg["x"][all_labels==l]
+ ifnp.linalg.matrix_rank(x)<x.shape[1]:
+ small_diag_indices=np.abs(np.diag(np.linalg.qr(x)[1]))<1e-10
+ x=x[:,~small_diag_indices]
+
+ if"w"notindata_reg:
+ try:
+ x=np.hstack((np.ones((x.shape[0],1)),x))
+ reg=data_reg["reg"](
+ y=data_reg["y"][all_labels==l],x=x,**kargs
+ )
+ exceptnp.linalg.LinAlgError:
+ x=_const_x(x)
+ reg=data_reg["reg"](
+ y=data_reg["y"][all_labels==l],x=x,**kargs
+ )
+ else:
+ l_arrays=np.array(all_labels)
+
+ regi_ids=list(np.where(l_arrays==l)[0])
+ w_ids=list(map(data_reg["w"].id_order.__getitem__,regi_ids))
+ w_regi_i=w_subset(data_reg["w"],w_ids,silence_warnings=True)
+ try:
+ x=np.hstack((np.ones((x.shape[0],1)),x))
+ reg=data_reg["reg"](
+ y=data_reg["y"][all_labels==l],x=x,w=w_regi_i,**kargs
+ )
+ exceptnp.linalg.LinAlgError:
+ x=_const_x(x)
+ reg=data_reg["reg"](
+ y=data_reg["y"][all_labels==l],x=x,w=w_regi_i,**kargs
+ )
+ trees_scores[l]=np.dot(reg.u.T,reg.u)[0][0]
+ score=sum(trees_scores.values())
+ else:
+ part_scores,score,trees_scores=self._data_reg_none(
+ data,all_labels,l,set_labels
+ )
+
+ returnscore,trees_scores
+
+
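+
+ # Standalone sketch of the rank-deficiency guard used by the scoring methods: columns of a
+ # region's X whose R-diagonal in a QR decomposition is numerically zero are dropped before
+ # the regression is attempted.
+ import numpy as np
+
+ x = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])  # rank 1, but two columns
+ if np.linalg.matrix_rank(x) < x.shape[1]:
+     small_diag_indices = np.abs(np.diag(np.linalg.qr(x)[1])) < 1e-10
+     x = x[:, ~small_diag_indices]
+ print(x.shape)  # (3, 1): the redundant column was removed
+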
+
+[docs]
+ defscore_stats(
+ self,
+ data=None,
+ data_reg=None,
+ all_labels=None,
+ quorum=-np.inf,
+ current_labels=None,
+ current_tree=None,
+ ):
+"""
+ This yields a score for the data using methods from the statsmodels library, given the labels provided. If no labels are provided,
+ and the object has been fit, then the labels discovered from the previous fit are used.
+
+ If a quorum is not passed, it is assumed to be irrelevant.
+
+ If a quorum is passed and the labels do not meet quorum, the score is inf.
+
+ data : (N,P) array of data on which to compute the score of the regions expressed in labels
+ data_reg : dictionary containing:
+ 1- a callable statsmodels regression method (ex. OLS)
+ 2- np.ndarray of (N,1) shape with N observations on the dependent variable for the regression
+ 3- np.ndarray of (N,k) shape with N observations and k columns containing the explanatory variables (constant must not be included)
+ 4- pysal W object to be used in the regression (optional)
+ all_labels : (N,) flat vector of labels expressing the classification of each observation into a region considering the cut under evaluation.
+ quorum : int expressing the minimum size of regions. Can be -inf if there is no lower bound.
+ Any region below quorum makes the score inf.
+ current_labels: (N,) flat vector of labels expressing the classification of each observation into a region not considering the cut under evaluation.
+
+ current_tree: integer indicating the label of the tree currently being considered for division
+
+ NOTE: Optimization occurs with respect to a *dissimilarity* metric, so the problem *minimizes*
+ the map dissimilarity. So, lower scores are better.
+ """
+ labels,subtree_quorums=self._prep_score(
+ all_labels,current_tree,current_labels
+ )
+ if(subtree_quorums<quorum).any():
+ returnnp.inf,None
+ set_labels=set(labels)
+ ifdata_regisnotNone:
+ kargs={
+ k:v
+ fork,vindata_reg.items()
+ ifknotin["reg","y","x","w","x_nd"]
+ }
+ trees_scores={}
+ forlinset_labels:
+ x=data_reg["x"][all_labels==l]
+ ifnp.linalg.matrix_rank(x)<x.shape[1]:
+ small_diag_indices=np.abs(np.diag(np.linalg.qr(x)[1]))<1e-10
+ x=x[:,~small_diag_indices]
+
+ try:
+ x=np.hstack((np.ones((x.shape[0],1)),x))
+ reg=data_reg["reg"](
+ data_reg["y"][all_labels==l],x,**kargs
+ ).fit()
+ exceptnp.linalg.LinAlgError:
+ x=_const_x(x)
+ reg=data_reg["reg"](
+ data_reg["y"][all_labels==l],x,**kargs
+ ).fit()
+
+ trees_scores[l]=np.sum(reg.resid**2)
+ score=sum(trees_scores.values())
+ else:
+ part_scores,score,trees_scores=self._data_reg_none(
+ data,all_labels,l,set_labels
+ )
+ returnscore,trees_scores
+
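+
+ # Minimal sketch, assuming statsmodels is installed, of the scoring step in score_stats above:
+ # fit the supplied regression on one region's observations and take the sum of squared residuals.
+ import numpy as np
+ import statsmodels.api as sm
+
+ rng = np.random.default_rng(0)
+ x = np.hstack((np.ones((20, 1)), rng.normal(size=(20, 2))))
+ y = x @ np.array([[1.0], [0.5], [-0.3]]) + rng.normal(size=(20, 1))
+ reg = sm.OLS(y, x).fit()
+ print(np.sum(reg.resid ** 2))  # the per-region score accumulated into trees_scores
+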
+
+ def_prep_score(self,all_labels,current_tree,current_labels):
+ ifall_labelsisNone:
+ try:
+ labels=self.current_labels_
+ exceptAttributeError:
+ raiseValueError(
+ "Labels not provided and MSF_Prune object has not been fit to data yet."
+ )
+ ifcurrent_treeisnotNone:
+ labels=all_labels[current_labels==current_tree]
+ _,subtree_quorums=np.unique(labels,return_counts=True)
+ returnlabels,subtree_quorums
+
+ def_data_reg_none(self,data,all_labels,l,set_labels):
+ assertdata.shape[0]==len(
+ all_labels
+ ),"Length of label array ({}) does not match ""length of data ({})! ".format(
+ all_labels.shape[0],data.shape[0]
+ )
+ part_scores=[
+ self.reduction(
+ self.metric(
+ X=data[all_labels==l],
+ Y=self.center(data[all_labels==l],axis=0).reshape(1,-1),
+ )
+ )
+ forlinset_labels
+ ]
+
+ score=self.reduction(part_scores).item()
+ trees_scores={l:part_scores[i]fori,linenumerate(set_labels)}
+ returnpart_scores,score,trees_scores
+
+ def_prep_lag(self,data_reg):
+ # if the model is a spatial lag, add the lagged dependent variable to the model
+ data_reg['yend'],data_reg['q']=set_endog(data_reg["y"],data_reg["x"][:,1:],data_reg["w"],yend=None,
+ q=None,w_lags=1,lag_q=True)
+ returndata_reg
+
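+
+ # Illustrative sketch of the data_reg dictionary consumed by find_cut below and by the scoring
+ # methods above; the keys ("reg", "y", "x", "w") are the ones read in this module, and the
+ # Columbus data set is used only as an example.
+ import numpy as np
+ import libpysal
+ import spreg
+
+ db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ y = np.array(db.by_col("HOVAL")).reshape(-1, 1)
+ x = np.array([db.by_col("INC"), db.by_col("CRIME")]).T  # no constant, as required
+ w = libpysal.weights.Queen.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ w.transform = "r"
+ data_reg = {"reg": spreg.OLS, "y": y, "x": x, "w": w}
+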
+
+[docs]
+ deffind_cut(
+ self,
+ MSF,
+ data=None,
+ data_reg=None,
+ current_n_subtrees=None,
+ current_labels=None,
+ quorum=-np.inf,
+ trees_scores=None,
+ labels=None,
+ target_label=None,
+ make=False,
+ verbose=False,
+ model_family="spreg",
+ ):
+"""
+ Find the best cut from the MSF.
+
+ MSF: (N,N) scipy sparse matrix with zero elements removed.
+ Represents the adjacency matrix for the minimum spanning forest.
+ Constructed from sparse.csgraph.sparse_from_dense or using MSF.eliminate_zeros().
+ You MUST remove zero entries for this to work, otherwise they are considered no-cost paths.
+ data: (N,p) attribute matrix. If not provided, replaced with (N,1) vector of ones.
+ data_reg: optional dictionary containing:
+ 1- a callable spreg or statsmodels regression method (ex. OLS or GM_Lag)
+ 2- np.ndarray of (N,1) shape with N observations on the dependent variable for the regression
+ 3- np.ndarray of (N,k) shape with N observations and k columns containing the explanatory variables (constant must not be included)
+ 4- pysal W object to be used in the regression (optional)
+ current_n_subtrees: integer indicating the current number of subtrees.
+ current_labels: (N,) flat vector of labels expressing the classification of each observation into a region, not considering the cut under evaluation.
+ quorum: int denoting the minimum number of elements in the region
+ trees_scores: dictionary mapping each subtree's label to its current score.
+ labels: (N,) flat vector of labels for each point. Represents the "cluster labels"
+ for disconnected components of the graph.
+ target_label: int from the labels array to subset the MSF. If passed along with `labels`, then a cut
+ will be found that is restricted to that subset of the MSF.
+ make: bool, whether or not to modify the input MSF in order to make the best cut that was found.
+ verbose: bool/int, denoting how much output to provide to the user, in terms
+ of print statements or progressbars
+
+ Returns a namedtuple with in_node, out_node, and score.
+ """
+ ifdataisNone:
+ data=np.ones(MSF.shape)
+
+ if(labelsisNone)!=(target_labelisNone):
+ raiseValueError(
+ "Both labels and target_label must be supplied! Only {} provided.".format(
+ ["labels","target_label"][int(target_labelisNone)]
+ )
+ )
+ ifverbose:
+ try:
+ fromtqdmimporttqdm
+ exceptImportError:
+
+ deftqdm(noop,desc=""):
+ returnnoop
+
+ else:
+
+ deftqdm(noop,desc=""):
+ returnnoop
+
+ zero_in=(labelsisnotNone)and(target_labelisnotNone)
+ best_deletion=deletion(np.nan,np.nan,np.inf)
+ best_d_score=-np.inf
+
+ try:
+ ifdata_reg["reg"].__name__=="GM_Lag"ordata_reg["reg"].__name__=="BaseGM_Lag":
+ data_reg=self._prep_lag(data_reg)
+ except:
+ pass
+
+ try:
+ old_score=sum(trees_scores.values())
+ except:
+ pass
+ best_scores={}
+ current_list=current_labels.tolist()
+ forin_node,out_nodeintqdm(
+ np.vstack(MSF.nonzero()).T,desc="finding cut..."
+ ):# iterate over MSF edges
+ ifzero_in:
+ iflabels[in_node]!=target_label:
+ continue
+
+ local_MSF=copy.deepcopy(MSF)
+ # delete a candidate edge
+ local_MSF[in_node,out_node]=0
+ local_MSF.eliminate_zeros()
+ current_tree=current_labels[in_node]
+
+ # get the connected components
+ local_n_subtrees,local_labels=cg.connected_components(
+ local_MSF,directed=False
+ )
+
+ iflocal_n_subtrees<=current_n_subtrees:
+ raiseException("Malformed MSF!")
+
+ # compute the score of these components
+ ifmodel_family=="spreg":
+ new_score,new_trees_scores=self.score_spreg(
+ data,data_reg,local_labels,quorum,current_labels,current_tree
+ )
+ elifmodel_family=="statsmodels":
+ new_score,new_trees_scores=self.score_stats(
+ data,data_reg,local_labels,quorum,current_labels,current_tree
+ )
+ else:
+ raiseValueError("Model family must be either spreg or statsmodels.")
+
+ ifnp.isfinite(new_score):
+ try:
+ d_score=trees_scores[current_tree]-new_score
+ score=old_score-d_score
+ except:
+ d_score=-new_score
+ score=new_score
+ # if the d_score is greater than the best score and quorum is met
+ ifd_score>best_d_score:
+ best_deletion=deletion(in_node,out_node,score)
+ best_d_score=d_score
+ try:
+ foriinset(current_labels):
+ best_scores[
+ local_labels[current_list.index(i)]
+ ]=trees_scores[i]
+ foriinnew_trees_scores:
+ best_scores[i]=new_trees_scores[i]
+ except:
+ best_scores=new_trees_scores
+ best_MSF=local_MSF
+ best_labels=local_labels
+ try:
+ returnbest_deletion,best_scores,best_MSF,local_n_subtrees,best_labels
+ exceptUnboundLocalError:# in case no solution is found
+ returndeletion(None,None,np.inf),np.inf,None,np.inf,None
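+
+ # Standalone sketch of the core move in find_cut above: delete one edge of the spanning
+ # forest, drop the explicit zero, and recount the connected components of the two subtrees.
+ import numpy as np
+ from scipy.sparse import csr_matrix
+ from scipy.sparse import csgraph as cg
+
+ rows, cols, vals = [0, 1, 2], [1, 2, 3], [1.0, 1.0, 1.0]  # path graph 0-1-2-3
+ tree = csr_matrix((vals, (rows, cols)), shape=(4, 4))
+ cut = tree.copy()
+ cut[1, 2] = 0          # candidate deletion
+ cut.eliminate_zeros()  # required: explicit zeros would be treated as free paths
+ n_parts, labels = cg.connected_components(cut, directed=False)
+ print(n_parts, labels)  # 2 [0 0 1 1]
+
+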
+"""
+Spatial random effects panel model based on: :cite:`KKP2007`
+"""
+
+__author__=(
+ "Luc Anselin anselin@uchicago.edu, Pedro Amaral pedroamaral@cedeplar.ufmg.br"
+)
+
+fromscipyimportsparseasSP
+importnumpyasnp
+from.importolsasOLS
+from.utilsimportoptim_moments,RegressionPropsY,get_spFilter,spdot,set_warn
+from.importuser_outputasUSER
+from.importsummary_outputasSUMMARY
+from.importregimesasREGI
+
+# import warnings
+
+
+__all__=["GM_KKP"]
+
+
+classBaseGM_KKP(RegressionPropsY):
+'''
+ Base GMM method for a spatial random effects panel model based on
+ Kapoor, Kelejian and Prucha (2007) :cite:`KKP2007`.
+
+ Parameters
+ ----------
+ y : array
+ n*tx1 array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and one column for each
+ independent (exogenous) variable
+ (note: must already include constant term)
+ w : spatial weights object
+ Spatial weights matrix
+ full_weights: boolean
+ Considers different weights for each of the 6 moment
+ conditions if True or only 2 sets of weights for the
+ first 3 and the last 3 moment conditions if False (default)
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ vm : array
+ Variance covariance matrix (kxk)
+ '''
+
+ def__init__(self,y,x,w,full_weights=False):
+ # 1a. OLS --> \tilde{\delta}
+ ols=OLS.BaseOLS(y=y,x=x)
+ self.x,self.y,self.n,self.k,self.xtx=ols.x,ols.y,ols.n,ols.k,ols.xtx
+ N=w.n
+ T=y.shape[0]//N
+ moments,trace_w2=_moments_kkp(w.sparse,ols.u,0)
+ lambda1,sig_v=optim_moments(moments,all_par=True)
+ Tw=SP.kron(SP.identity(T),w.sparse)
+ ub=Tw.dot(ols.u)
+ ulu=ols.u-lambda1*ub
+ Q1=SP.kron(np.ones((T,T))/T,SP.identity(N))
+ sig_1=float(np.dot(ulu.T,Q1.dot(ulu))/N)
+ # print('initial_lamb_sig:',lambda1,sig_v,sig_1)
+ # print('theta:', 1 - np.sqrt(sig_v)/ np.sqrt(sig_1))
+ Xi_a=SP.diags([(sig_v*sig_v)/(T-1),sig_1*sig_1])
+ iffull_weights:
+ Tau=_get_Tau(w.sparse,trace_w2)
+ else:
+ Tau=SP.identity(3)
+ Xi=SP.kron(Xi_a,Tau)
+ moments_b,_=_moments_kkp(w.sparse,ols.u,1,trace_w2)
+ G=np.vstack((np.hstack((moments[0],np.zeros((3,1)))),moments_b[0]))
+ moments6=[G,np.vstack((moments[1],moments_b[1]))]
+ lambda2,sig_vb,sig_1b=optim_moments(
+ moments6,vcX=Xi.toarray(),all_par=True,start=[lambda1,sig_v,sig_1]
+ )
+ # 2a. reg -->\hat{betas}
+ theta=1-np.sqrt(sig_vb)/np.sqrt(sig_1b)
+ # print('theta:', theta)
+ gls_w=SP.identity(N*T)-theta*Q1
+
+ # With omega
+ xs=gls_w.dot(get_spFilter(w,lambda2,x))
+ ys=gls_w.dot(get_spFilter(w,lambda2,y))
+ ols_s=OLS.BaseOLS(y=ys,x=xs)
+ self.predy=spdot(self.x,ols_s.betas)
+ self.u=self.y-self.predy
+ self.vm=ols_s.vm# Check
+ self.betas=np.vstack((ols_s.betas,lambda2,sig_vb,sig_1b))
+ self.e_filtered=self.u-lambda2*SP.kron(SP.identity(T),w.sparse).dot(
+ self.u
+ )
+ self.t,self.n=T,N
+ self._cache={}
+
+
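+
+ # Small numeric sketch (toy dimensions, not from the source) of the GLS weighting applied in
+ # step 2a above: gls_w subtracts theta times each unit's time average from its stacked observations.
+ import numpy as np
+ from scipy import sparse as SP
+
+ N, T, theta = 3, 2, 0.4
+ Q1 = SP.kron(np.ones((T, T)) / T, SP.identity(N))
+ gls_w = SP.identity(N * T) - theta * Q1
+ u = np.arange(N * T, dtype=float).reshape(-1, 1)  # stacked: all N units for t=0, then t=1
+ print(gls_w.dot(u).ravel())  # each entry is u minus theta times that unit's time average
+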
+
+[docs]
+classGM_KKP(BaseGM_KKP,REGI.Regimes_Frame):
+'''
+ GMM method for a spatial random effects panel model based on
+ Kapoor, Kelejian and Prucha (2007) :cite:`KKP2007`.
+
+ Parameters
+ ----------
+ y : array
+ n*tx1 or nxt array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and k columns for
+ independent (exogenous) variable or n rows and k*t columns
+ (note, must not include a constant term)
+ w : spatial weights object
+ Spatial weights matrix, nxn
+ full_weights: boolean
+ Considers different weights for each of the 6 moment
+ conditions if True or only 2 sets of weights for the
+ first 3 and the last 3 moment conditions if False (default)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'y'.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string or list of strings
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_filtered : array
+ nx1 array of spatially filtered residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ t : integer
+ Number of time periods
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ vm : array
+ Variance covariance matrix (kxk)
+ chow : tuple
+ Contains 2 elements. 1: Pair of Wald statistic and p-value
+ for the setup of global regime stability. 2: array with Wald
+ statistic (col 0) and its p-value (col 1) for each beta that
+ varies across regimes.
+ Exists only if regimes is not None.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regime variable for use in the output
+ title : string
+ Name of the regression method used
+ Examples
+ --------
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+ >>> from spreg import GM_KKP
+ >>> import numpy as np
+ >>> import libpysal
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; The GM_KKP function requires
+ data to be passed in as numpy arrays, hence the user can read their
+ data in using any method.
+ >>> nat = libpysal.examples.load_example('NCOVR')
+ >>> db = libpysal.io.open(nat.get_path("NAT.dbf"),'r')
+ Extract the HR (homicide rates) data in the 70's, 80's and 90's from the DBF file
+ and make it the dependent variable for the regression. Note that the data can also
+ be passed in the long format instead of wide format (i.e. a vector with n*t rows
+ and a single column for the dependent variable and a matrix of dimension n*txk
+ for the independent variables).
+ >>> name_y = ['HR70','HR80','HR90']
+ >>> y = np.array([db.by_col(name) for name in name_y]).T
+ Extract RD and PS in the same time periods from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxk*t numpy array, where k is the number of independent variables (not
+ including a constant) and t is the number of time periods. Data must be
+ organized in a way that all time periods of a given variable are side-by-side
+ and in the correct time order.
+ By default a vector of ones will be added to the independent variables passed in.
+ >>> name_x = ['RD70','RD80','RD90','PS70','PS80','PS90']
+ >>> x = np.array([db.by_col(name) for name in name_x]).T
+ Since we want to run a spatial error panel model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations
+ into the error component of the model. To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``NAT.shp``.
+ >>> w = libpysal.weights.Queen.from_shapefile(libpysal.examples.get_path("NAT.shp"))
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows one to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+ >>> w.transform = 'r'
+ We are all set with the preliminaries, we are good to run the model. In this
+ case, we will need the variables and the weights matrix. If we want to
+ have the names of the variables printed in the output summary, we will
+ have to pass them in as well, although this is optional. In this example
+ we set full_weights to False (the default), indicating that we will use
+ only 2 sets of moments weights for the first 3 and the last 3 moment conditions.
+ >>> reg = GM_KKP(y,x,w,full_weights=False,name_y=name_y, name_x=name_x)
+ Warning: Assuming time data is in wide format, i.e. y[0] refers to T0, y[1], refers to T1, etc.
+ Similarly, assuming x[0:k] refers to independent variables for T0, x[k+1:2k] refers to T1, etc.
+ Once we have run the model, we can explore a little bit the output. We can
+ either request a printout of the results with the command print(reg.summary) or
+ check out the individual attributes of GM_KKP:
+ >>> print(reg.summary)
+ REGRESSION
+ ----------
+ SUMMARY OF OUTPUT: GM SPATIAL ERROR PANEL MODEL - RANDOM EFFECTS (KKP)
+ ----------------------------------------------------------------------
+ Data set : unknown
+ Weights matrix : unknown
+ Dependent Variable : HR Number of Observations: 3085
+ Mean dependent var : 6.4983 Number of Variables : 3
+ S.D. dependent var : 6.9529 Degrees of Freedom : 3082
+ Pseudo R-squared : 0.3248
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ CONSTANT 6.4922156 0.1126713 57.6208690 0.0000000
+ RD 3.6244575 0.0877475 41.3055536 0.0000000
+ PS 1.3118778 0.0852516 15.3883058 0.0000000
+ lambda 0.4177759
+ sigma2_v 22.8190822
+ sigma2_1 39.9099323
+ ------------------------------------------------------------------------------------
+ ================================ END OF REPORT =====================================
+ >>> print(reg.name_x)
+ ['CONSTANT', 'RD', 'PS', 'lambda', ' sigma2_v', 'sigma2_1']
+ The attribute reg.betas contains all the coefficients: betas, the spatial error
+ coefficient lambda, sig2_v and sig2_1:
+ >>> print(np.around(reg.betas,4))
+ [[ 6.4922]
+ [ 3.6245]
+ [ 1.3119]
+ [ 0.4178]
+ [22.8191]
+ [39.9099]]
+ Finally, we can check the standard errors of the betas:
+ >>> print(np.around(np.sqrt(reg.vm.diagonal().reshape(3,1)),4))
+ [[0.1127]
+ [0.0877]
+ [0.0853]]
+ '''
+
+
+
+
+ def_set_regimes(self,w,n_rows):# Must add case for regime_err_sep = True
+ self.constant_regi="many"
+ self.cols2regi="all"
+ self.regime_err_sep=False
+ self.regimes_set=REGI._get_regimes_set(self.regimes)
+ iflen(self.regimes)==w.n:
+ regimes_l=self.regimes*(n_rows//w.n)
+ eliflen(self.regimes)==n_rows:
+ regimes_l=self.regimes
+ else:
+ raiseException("The length of 'regimes' must be either equal to n or n*t.")
+ returnregimes_l
+
+
+
+def_moments_kkp(ws,u,i,trace_w2=None):
+"""
+ Compute G and g matrices for the KKP model.
+ ...
+
+ Parameters
+ ----------
+ ws : Sparse matrix
+ Spatial weights sparse matrix
+ u : array
+ Residuals. nx1 array assumed to be aligned with w
+
+ i : integer
+ 0 if Q0, 1 if Q1
+ trace_w2 : float
+ trace of WW. Computed in 1st step and saved for step 2.
+ Returns
+ -------
+ moments : list
+ List of two arrays corresponding to the matrices 'G' and
+ 'g', respectively.
+ trace_w2 : float
+ trace of WW. Computed in 1st step and saved for step 2.
+ """
+ N=ws.shape[0]
+ T=u.shape[0]//N
+ ifi==0:
+ Q=SP.kron(SP.identity(T)-np.ones((T,T))/T,SP.identity(N))
+ else:
+ Q=SP.kron(np.ones((T,T))/T,SP.identity(N))
+ Tw=SP.kron(SP.identity(T),ws)
+ ub=Tw.dot(u)
+ ubb=Tw.dot(ub)
+ Qu=Q.dot(u)
+ Qub=Q.dot(ub)
+ Qubb=Q.dot(ubb)
+ G11=float(2*np.dot(u.T,Qub))
+ G12=float(-np.dot(ub.T,Qub))
+ G21=float(2*np.dot(ubb.T,Qub))
+ G22=float(-np.dot(ubb.T,Qubb))
+ G31=float(np.dot(u.T,Qubb)+np.dot(ub.T,Qub))
+ G32=float(-np.dot(ub.T,Qubb))
+ iftrace_w2==None:
+ trace_w2=(ws.power(2)).sum()
+ G23=((T-1)**(1-i))*trace_w2
+ ifi==0:
+ G=np.array(
+ [[G11,G12,N*(T-1)**(1-i)],[G21,G22,G23],[G31,G32,0]]
+ )/(N*(T-1)**(1-i))
+ else:
+ G=np.array(
+ [
+ [G11,G12,0,N*(T-1)**(1-i)],
+ [G21,G22,0,G23],
+ [G31,G32,0,0],
+ ]
+ )/(N*(T-1)**(1-i))
+ g1=float(np.dot(u.T,Qu))
+ g2=float(np.dot(ub.T,Qub))
+ g3=float(np.dot(u.T,Qub))
+ g=np.array([[g1,g2,g3]]).T/(N*(T-1)**(1-i))
+ return[G,g],trace_w2
+
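+
+ # Quick standalone check (not in the source) that the two matrices built in _moments_kkp above
+ # are complementary projections: Q0 is the within (time-demeaning) operator and Q1 the between
+ # (time-averaging) operator.
+ import numpy as np
+ from scipy import sparse as SP
+
+ N, T = 2, 3
+ Q0 = SP.kron(SP.identity(T) - np.ones((T, T)) / T, SP.identity(N))
+ Q1 = SP.kron(np.ones((T, T)) / T, SP.identity(N))
+ print(np.allclose((Q0 + Q1).toarray(), np.eye(N * T)))  # True: they sum to the identity
+ print(np.allclose((Q0 @ Q1).toarray(), 0))              # True: they are orthogonal
+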
+
+def_get_Tau(ws,trace_w2):
+"""
+ Computes Tau as in :cite:`KKP2007`.
+ ...
+
+ Parameters
+ ----------
+ ws : Sparse matrix
+ Spatial weights sparse matrix
+ trace_w2 : float
+ trace of WW. Computed in 1st step of _moments_kkp
+ """
+ N=ws.shape[0]
+ T12=2*trace_w2/N
+ wtw=ws.T.dot(ws)
+ T22=wtw.power(2).sum()
+ wtpw=ws.T+ws
+ T23=wtw.multiply(wtpw).sum()
+ d_wwpwtw=ws.multiply(ws.T).sum(0)+wtw.diagonal()
+ T33=d_wwpwtw.sum()
+ Tau=np.array([[2*N,T12,0],[T12,T22,T23],[0,T23,T33]])/N
+ returnTau
+
+
+def_get_panel_data(y,x,w,name_y,name_x):
+"""
+ Performs some checks on the data structure and converts from wide to long if needed.
+ ...
+
+ Parameters
+ ----------
+ y : array
+ n*tx1 or nxt array for dependent variable
+ x : array
+ Two dimensional array with n*t rows and k columns for
+ independent (exogenous) variable or n rows and k*t columns
+ (note, must not include a constant term)
+ name_y : string or list of strings
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ """
+
+ ify.shape[0]/w.n!=y.shape[0]//w.n:
+ raiseException("y must be ntx1 or nxt, and w must be an nxn PySAL W object.")
+ N,T=y.shape[0],y.shape[1]
+ k=x.shape[1]//T
+ ifx.shape[0]!=Nandx.shape[0]!=N*T:
+ raiseException(
+ "X must have either n rows and k*t columns or n*t rows and k columns."
+ )
+ ifx.shape[1]!=kandx.shape[1]!=k*T:
+ raiseException(
+ "X must have either n rows and k*t columns or n*t rows and k columns."
+ )
+ ify.shape[1]>1:
+ message=(
+ "Assuming time data is in wide format, i.e. y[0] refers to T0, y[1], refers to T1, etc."
+ "\n Similarly, assuming x[0:k] refers to independent variables for T0, x[k+1:2k] refers to T1, etc."
+ )
+ print("Warning: "+message)
+ # warnings.warn(message)
+
+ ify.shape[1]!=T:
+ raiseException(
+ "y in wide format must have t columns and be compatible with x's k*t columns."
+ )
+
+ bigy=y.reshape((y.size,1),order="F")
+
+ bigx=x[:,0:T].reshape((N*T,1),order="F")
+ foriinrange(1,k):
+ bigx=np.hstack(
+ (bigx,x[:,T*i:T*(i+1)].reshape((N*T,1),order="F"))
+ )
+ else:
+ bigy,bigx=y,x
+
+ ifname_y:
+ ifnotisinstance(name_y,str)andnotisinstance(name_y,list):
+ raiseException("name_y must either be strings or a list of strings.")
+ iflen(name_y)>1andisinstance(name_y,list):
+ name_y="".join([iforiinname_y[0]ifnoti.isdigit()])
+ iflen(name_y)==1andisinstance(name_y,list):
+ name_y=name_y[0]
+ ifname_x:
+ iflen(name_x)!=k*Tandlen(name_x)!=k:
+ raiseException(
+ "Names of columns in X must have exactly either k or k*t elements."
+ )
+ iflen(name_x)>k:
+ name_bigx=[]
+ foriinrange(k):
+ name_bigx.append("".join([jforjinname_x[i*T]ifnotj.isdigit()]))
+ name_x=name_bigx
+
+ returnbigy,bigx,name_y,name_x
+
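+
+ # Standalone sketch of the wide-to-long reshape performed in _get_panel_data above: with
+ # order="F", all cross-sectional units for the first period are stacked before those of the next.
+ import numpy as np
+
+ y_wide = np.array([[1.0, 2.0, 3.0],
+                    [4.0, 5.0, 6.0]])  # 2 units observed over 3 periods
+ y_long = y_wide.reshape((y_wide.size, 1), order="F")
+ print(y_long.ravel())  # [1. 4. 2. 5. 3. 6.]
+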
+
+def_test():
+ importdoctest
+
+ start_suppress=np.get_printoptions()["suppress"]
+ np.set_printoptions(suppress=True)
+ doctest.testmod()
+ np.set_printoptions(suppress=start_suppress)
+
+
+if__name__=="__main__":
+ _test()
+
+[docs]
+defstge_classic(y,x,w,w_lags=2,robust=None,sig2n_k=True,
+ name_y=False,name_x=False,name_w=False,name_ds=False,latex=False,
+ p_value=0.01,finmod=True,mprint=True):
+"""
+ Classic forward specification: Evaluate results from LM-tests and their robust versions from spreg.OLS.
+ Estimate lag model with AK test if warranted.
+
+ Arguments:
+ ----------
+ y : dependent variable
+ x : matrix of explanatory variables
+ w : spatial weights
+ w_lags : number of lags to be used as instruments in S2SLS
+ robust : White standard errors?
+ sig2n_k : error variance estimate (consistent or unbiased=True)
+ name_y : name of dependent variable (string)
+ name_x : list of strings with x-variable names
+ name_w : string with name for spatial weights
+ name_ds : string with name for data set
+ latex : flag for latex output
+ p_value : significance threshold
+ finmod : flag for estimation of final model
+ mprint : flag for regression summary as search result
+
+ Returns:
+ ----------
+ result: the selected model as a string
+ 0 = OLS
+ 1 = LAG
+ 2 = ERROR
+ 3 = LAGr
+ 4 = ERRORr
+ 5 = LAG_Br
+ 6 = ERROR_Br
+ 7 = LAG_Nr
+ 8 = ERROR_Nr
+ 9 = SARSAR
+ finreg: regression object for final model
+
+ Example:
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import stge_classic
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ >>> y_var = "CRIME"
+ >>> y = np.array([db.by_col(y_var)]).reshape(49, 1)
+ >>> x_var = ["INC", "HOVAL"]
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = "r"
+ >>> name_y = y_var
+ >>> name_x = x_var
+ >>> name_w = "Rook Weights"
+ >>> name_ds = "Columbus Data"
+
+ >>> result, finreg = stge_classic(y, x, w, mprint=True,
+ ... name_y=name_y, name_x=name_x, name_w=name_w, name_ds=name_ds)
+ Model selected by STGE-Classic: LAG
+ REGRESSION RESULTS
+ ------------------
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES
+ --------------------------------------------------
+ Data set :Columbus Data
+ Weights matrix :Rook Weights
+ Dependent Variable : CRIME Number of Observations: 49
+ Mean dependent var : 35.1288 Number of Variables : 4
+ S.D. dependent var : 16.7321 Degrees of Freedom : 45
+ Pseudo R-squared : 0.6513
+ Spatial Pseudo R-squared: 0.5733
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ CONSTANT 45.45909 10.72499 4.23861 0.00002
+ INC -1.04101 0.37241 -2.79531 0.00519
+ HOVAL -0.25954 0.08855 -2.93085 0.00338
+ W_CRIME 0.41929 0.17977 2.33245 0.01968
+ ------------------------------------------------------------------------------------
+ Instrumented: W_CRIME
+ Instruments: W2_HOVAL, W2_INC, W_HOVAL, W_INC
+ <BLANKLINE>
+ DIAGNOSTICS FOR SPATIAL DEPENDENCE
+ TEST DF VALUE PROB
+ Anselin-Kelejian Test 1 0.130 0.7185
+ <BLANKLINE>
+ SPATIAL LAG MODEL IMPACTS
+ Variable Direct Indirect Total
+ INC -1.0410 -0.7517 -1.7927
+ HOVAL -0.2595 -0.1874 -0.4469
+ ================================ END OF REPORT =====================================
+
+
+ """
+
+ finreg=False
+ p=p_value
+ k=0# indicator for type of final model 0 = OLS; 1 = Lag; 2 = Error; 3 = SAR-SAR; 4 = LAG from SAR
+
+ ifnot(name_y)ornot(name_x):
+
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+
+ name_y=model_ols_1.name_y
+ name_x=model_ols_1.name_x[1:]
+
+ else:
+
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_y=name_y,name_x=name_x,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+
+
+
+ pvals=[model_ols_1.lm_error[1],model_ols_1.lm_lag[1],
+ model_ols_1.rlm_error[1],model_ols_1.rlm_lag[1],
+ model_ols_1.lm_sarma[1]]
+
+ p_error,p_lag,p_rerror,p_rlag,p_sarma=pvals
+ ifp_lag>=pandp_error>=p:#First test, no LM significant= Stop and keep OLS
+ result='OLS'
+ k=0
+ else:
+ #Just one significant
+ ifp_lag<pandp_error>=p:
+ result='LAG'
+ k=1
+ elifp_lag>=pandp_error<p:
+ result='ERROR'
+ k=2
+ #Both are significant (Check robust version)
+ elifp_lag<pandp_error<p:
+ #One robust significant
+ ifp_rlag<pandp_rerror>=p:
+ result='LAGr'
+ k=1
+ elifp_rlag>=pandp_rerror<p:
+ result='ERRORr'
+ k=2
+ #Both robust are significant (look for the most significant)
+ elifp_rlag<pandp_rerror<p:
+ # check AK in lag model
+ try:
+ model_lag=STSLS.GM_Lag(y,x,w=w,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+
+ ak_lag=AKtest(model_lag,w,case='gen')
+ ifak_lag.p<=p:
+ result='SARSAR'
+ k=3
+ elifp_rlag<=p_rerror:
+ result='LAG_Br'
+ k=4
+ elifp_rlag>p_rerror:
+ result='ERROR_Br'
+ k=2
+ except:
+ ifp_rlag<=p_rerror:
+ result='LAG_Br'
+ k=1
+ else:
+ result='ERROR_Br'
+ k=2
+
+ else:#None robust are significant (still look for the 'most significant')
+ ifp_rlag<=p_rerror:
+ result='LAG_Nr'
+ k=4
+ elifp_rlag>p_rerror:
+ result='ERROR_Nr'
+ k=2
+
+ iffinmod:# pass final regression
+ msel="Model selected by STGE-Classic: "
+ ifk==0:# OLS
+ finreg=model_ols_1
+
+ elif(k==1)or(k==4):# LAG
+ try:
+
+ finreg=STSLS.GM_Lag(y,x,w=w,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GM LAG parameters outside bounds"
+ finreg=False
+ elifk==2:# ERROR
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=0,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GMM Error parameters outside bounds"
+ finreg=False
+ elifk==3:# SARSAR
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,add_wy=True,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: SARSAR parameters outside bounds"
+ finreg=False
+
+# elif k == 4: # LAG already computed
+# finreg = model_lag
+ ifmprint:
+ print(msel+result)
+ ifnot(finreg==False):# cannot print when finreg=False
+ print(finreg.summary)
+
+ return(result,finreg)
+
+
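+
+ # Standalone sketch of the diagnostics stge_classic reads off a spreg OLS fit with spat_diag=True;
+ # each attribute is a (statistic, p-value) pair that the search compares to the p_value threshold.
+ import numpy as np
+ import libpysal
+ from spreg import OLS
+
+ db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ y = np.array([db.by_col("CRIME")]).reshape(49, 1)
+ x = np.array([db.by_col(name) for name in ["INC", "HOVAL"]]).T
+ w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ w.transform = "r"
+ ols = OLS(y, x, w=w, spat_diag=True)
+ for name in ["lm_error", "lm_lag", "rlm_error", "rlm_lag", "lm_sarma"]:
+     print(name, getattr(ols, name))
+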
+
+
+[docs]
+defstge_kb(y,x,w,w_lags=2,robust=None,sig2n_k=True,
+ name_y=False,name_x=False,name_w=False,name_ds=False,latex=False,
+ p_value=0.01,finmod=True,mprint=True):
+
+"""
+ Forward specification: Evaluate results from Koley-Bera LM-tests and their robust versions from spreg.OLS.
+
+ Arguments:
+ ----------
+ y : dependent variable
+ x : matrix of explanatory variables
+ w : spatial weights
+ w_lags : number of lags to be used as instruments in S2SLS
+ robust : White standard errors?
+ sig2n_k : error variance estimate (consistent or unbiased=True)
+ name_y : name of dependent variable (string)
+ name_x : list of strings with x-variable names
+ name_w : string with name for spatial weights
+ name_ds : string with name for data set
+ latex : flag for latex output
+ p_value : significance threshold
+ finmod : flag for estimation of final model
+ mprint : flag for regression summary as search result
+
+ Returns:
+ ----------
+ result: the selected model as a string
+ 0 = OLS
+ 1 = LAG
+ 2 = SLX
+ 3 = SDM
+ finreg: regression object for final model
+
+ Example:
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import stge_kb
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ >>> y_var = "CRIME"
+ >>> y = np.array([db.by_col(y_var)]).reshape(49, 1)
+ >>> x_var = ["INC", "HOVAL"]
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = "r"
+ >>> name_y = y_var
+ >>> name_x = x_var
+ >>> name_w = "Rook Weights"
+ >>> name_ds = "Columbus Data"
+
+ >>> result, finreg = stge_kb(y, x, w, name_y=name_y, name_x=name_x,
+ ... name_w=name_w, name_ds=name_ds, mprint=False)
+ >>> print("Model selected by STGE-KB:",result)
+ Model selected by STGE-KB: OLS
+
+
+ """
+
+
+ finreg=False
+ p=p_value
+ k=0# indicator for type of final model 0 = OLS; 1 = Lag; 2 = SLX; 3 = SDM
+
+ ifnot(name_y)ornot(name_x):
+
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+
+ name_y=model_ols_1.name_y
+ name_x=model_ols_1.name_x[1:]
+
+ else:
+
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_y=name_y,name_x=name_x,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+
+ pvals=[model_ols_1.rlm_wx[1],model_ols_1.rlm_durlag[1],
+ model_ols_1.lm_spdurbin[1]]
+
+ p_rlwx,p_rdury,p_spdur=pvals
+
+ # first check following KB(2024) - joint test on SDM
+ ifp_spdur>p:# not significant
+ result='OLS'
+ k=0
+ else:# joint test is significant
+ ifp_rlwx<pandp_rdury<p:
+ result='SDM'
+ k=3
+ elifp_rdury<p:# only robust lag
+ result='LAG'
+ k=1
+ elifp_rlwx<p:# only robust WX
+ result='SLX'
+ k=2
+ else:# should never be reached
+ result='OLS'
+ k=0
+
+ iffinmod:# pass final regression
+ msel="Model selected by STGE-KB: "
+ ifk==0:# OLS
+ finreg=model_ols_1
+ elifk==1:# LAG
+ try:
+ finreg=STSLS.GM_Lag(y,x,w=w,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GM Lag parameters outside bounds"
+ finreg=False
+ elifk==2:# SLX
+ finreg=OLS.OLS(y,x,w=w,slx_lags=1,spat_diag=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ elifk==3:# SDM
+ try:
+ finreg=STSLS.GM_Lag(y,x,w=w,slx_lags=1,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: SDM parameters outside bounds"
+ finreg=False
+ ifmprint:
+ print(msel+result)
+ ifnot(finreg==False):# cannot print when finreg=False
+ print(finreg.summary)
+
+ return(result,finreg)
+
+
+
+
+[docs]
+defstge_pre(y,x,w,w_lags=2,robust=None,sig2n_k=True,
+ name_y=False,name_x=False,name_w=False,name_ds=False,latex=False,
+ p_value=0.01,finmod=True,mprint=True):
+
+"""
+ Forward specification: Evaluate results from Koley-Bera LM-tests to decide on OLS vs SLX then
+ proceed as in stge_classic.
+
+ Arguments:
+ ----------
+ y : dependent variable
+ x : matrix of explanatory variables
+ w : spatial weights
+ w_lags : number of lags to be used as instruments in S2SLS
+ robust : White standard errors?
+ sig2n_k : error variance estimate (consistent or unbiased=True)
+ name_y : name of dependent variable (string)
+ name_x : list of strings with x-variable names
+ name_w : string with name for spatial weights
+ name_ds : string with name for data set
+ latex : flag for latex output
+ p_value : significance threshold
+ finmod : flag for estimation of final model
+ mprint : flag for regression summary as search result
+
+ Returns:
+ ----------
+ result: the selected model as a string
+ 0 = OLS
+ 1 = LAG
+ 2 = ERROR
+ 3 = SARMA
+ 4 = SLX
+ 5 = SDM
+ 6 = SLX-ERR
+ 7 = GNS
+ finreg: regression object for final model
+
+ Example:
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import stge_pre
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ >>> y_var = "CRIME"
+ >>> y = np.array([db.by_col(y_var)]).reshape(49, 1)
+ >>> x_var = ["INC", "HOVAL"]
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = "r"
+ >>> name_y = y_var
+ >>> name_x = x_var
+ >>> name_w = "Rook Weights"
+ >>> name_ds = "Columbus Data"
+
+ >>> result, finreg = stge_pre(y, x, w, name_y=name_y, name_x=name_x,
+ ... name_w=name_w, name_ds=name_ds, mprint=False)
+ >>> print("Model selected by STGE-Pre:",result)
+ Model selected by STGE-Pre: LAG
+
+ """
+
+ finreg=False
+ p=p_value
+ k=0# indicator for type of final model 0 = OLS; 1 = Lag; 2 = Error; 3 = SARSAR;
+ # 4 = SLX; 5 = SDM; 6 = SLX-Err; 7 = GNS
+
+ models=['OLS','LAG','ERROR','SARSAR','SLX','SDM','SLX-ERR','GNS']
+
+ ifnot(name_y)ornot(name_x):
+
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+
+ name_y=model_ols_1.name_y
+ name_x=model_ols_1.name_x[1:]
+
+ else:
+
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_y=name_y,name_x=name_x,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+
+
+ pv1=model_ols_1.lm_wx[1]# LM test on WX
+ pv2=model_ols_1.rlm_wx[1]# robust LM test in presence of rho
+
+ # selection of OLS or SLX
+ ifpv1<pandpv2<p:# proceed with SLX results
+ slx=1
+ model_ols_1=OLS.OLS(y,x,w=w,slx_lags=1,spat_diag=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ else:# stay with OLS estimation
+ slx=0
+
+ idv=slx*4# keeps track of model
+
+ pvals=[model_ols_1.lm_error[1],model_ols_1.lm_lag[1],
+ model_ols_1.rlm_error[1],model_ols_1.rlm_lag[1],
+ model_ols_1.lm_sarma[1]]
+
+ p_error,p_lag,p_rerror,p_rlag,p_sarma=pvals
+
+ ifp_lag>=pandp_error>=p:#First test, no LM significant= Stop and keep OLS or SLX
+ k=idv+0
+ result=models[k]
+ else:
+ #Just one significant
+ ifp_lag<pandp_error>=p:
+ k=idv+1
+ result=models[k]
+
+ elifp_lag>=pandp_error<p:
+ k=idv+2
+ result=models[k]
+
+ #Both are significant (Check robust version)
+ elifp_lag<pandp_error<p:
+ #One robust significant
+ ifp_rlag<pandp_rerror>=p:
+ k=idv+1
+ result=models[k]
+
+ elifp_rlag>=pandp_rerror<p:
+ k=idv+2
+ result=models[k]
+
+ #Both robust are significant (look for the most significant)
+ elifp_rlag<pandp_rerror<p:
+ # check AK in lag model
+ try:
+ model_lag=STSLS.GM_Lag(y,x,w=w,slx_lags=slx,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+
+ ak_lag=AKtest(model_lag,w,case='gen')
+ ifak_lag.p<=p:
+ k=idv+3
+ result=models[k]
+
+ elifp_rlag<=p_rerror:
+ k=idv+1
+ result=models[k]
+
+ elifp_rlag>p_rerror:
+ k=idv+2
+ result=models[k]
+
+ except:# ignore lag model
+ ifp_rlag<=p_rerror:
+ k=idv+1
+ result=models[k]
+
+ else:
+ k=idv+2
+ result=models[k]
+
+
+ else:#None robust are significant (still look for the 'most significant')
+ ifp_rlag<=p_rerror:
+ k=idv+1
+ result=models[k]
+
+ elifp_rlag>p_rerror:
+ k=idv+2
+ result=models[k]
+
+
+ iffinmod:# pass final regression
+ msel="Model selected by STGE-Pre: "
+ ifk==0ork==4:
+ finreg=model_ols_1
+ elifk==1ork==5:
+ try:
+ finreg=STSLS.GM_Lag(y,x,w=w,slx_lags=slx,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GM LAG parameters outside bounds"
+ finreg=False
+ elifk==2ork==6:
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=slx,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GMM Error parameters outside bounds"
+ finreg=False
+ elifk==3ork==7:
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=slx,add_wy=True,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: autoregressive parameters outside bounds"
+ finreg=False
+ ifmprint:
+ print(msel+result)
+ ifnot(finreg==False):# cannot print when finreg=False
+ print(finreg.summary)
+
+ return(result,finreg)
+
+
+
+
+[docs]
+defgets_gns(y,x,w,w_lags=2,robust=None,sig2n_k=True,
+ name_y=False,name_x=False,name_w=False,name_ds=False,latex=False,
+ p_value=0.01,finmod=True,mprint=True):
+
+"""
+ GETS specification starting with GNS model estimation. Estimate simplified model when t-tests are
+ not significant.
+
+ Arguments:
+ ----------
+ y : dependent variable
+ x : matrix of explanatory variables
+ w : spatial weights
+ w_lags : number of lags to be used as instruments in S2SLS
+ robust : White standard errors?
+ sig2n_k : error variance estimate (consistent or unbiased=True)
+ name_y : name of dependent variable (string)
+ name_x : list of strings with x-variable names
+ name_w : string with name for spatial weights
+ name_ds : string with name for data set
+ latex : flag for latex output
+ p_value : significance threshold
+ finmod : flag for estimation of final model
+ mprint : flag for regression summary as search result
+
+ Returns:
+ ----------
+ result: the selected model as a string
+ 0 = OLS
+ 1 = LAG
+ 2 = ERROR
+ 3 = SARSAR
+ 4 = SLX
+ 5 = SDM
+ 6 = SLX-Err
+ 7 = GNS
+ finreg: regression object for final model
+
+ Example:
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import gets_gns
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ >>> y_var = "CRIME"
+ >>> y = np.array([db.by_col(y_var)]).reshape(49, 1)
+ >>> x_var = ["INC", "HOVAL"]
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = "r"
+ >>> name_y = y_var
+ >>> name_x = x_var
+ >>> name_w = "Rook Weights"
+ >>> name_ds = "Columbus Data"
+
+ >>> result, finreg = gets_gns(y, x, w, name_y=name_y, name_x=name_x,
+ ... name_w=name_w, name_ds=name_ds, mprint=False)
+ >>> print("Model selected by GETS-GNS:",result)
+ Model selected by GETS-GNS: OLS
+
+ """
+
+ finreg=False
+ p=p_value
+
+ k=x.shape[1]
+
+ ifnot(name_y)ornot(name_x):
+
+ try:
+ model_gns=ERROR.GMM_Error(y,x,w=w,slx_lags=1,
+ add_wy=True,w_lags=w_lags,hard_bound=True,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: GNS parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+ name_y=model_gns.name_y
+ name_x=model_gns.name_x[1:k+1]
+
+
+ else:
+
+ try:
+ model_gns=ERROR.GMM_Error(y,x,w=w,slx_lags=1,add_wy=True,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: GNS parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+ pstats=np.array(model_gns.z_stat)[1+k:,1]# t statistics p-values
+ pk=len(pstats)# number of p-values, last one is p_lam, next to last p_rho, before that p_gam
+
+ ifpstats.max()<p:# least significant of three is still significant
+ result='GNS'
+ iffinmod:
+ finreg=model_gns
+
+ elifpstats.min()>=p:# all non-significant
+ result='OLS'
+
+ else:# at least one non-significant and one sig spatial parameter
+ # since max is not sig, but (at least) min is
+ cand=pstats.argmax()# least significant is not sig since max > p
+ ifcand==(pk-1):# lambda not significant, but at least one of rho/gamma is
+ # go to spatial Durbin - only rho and gam
+ try:
+ model_sdm=STSLS.GM_Lag(y,x,w=w,slx_lags=1,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: SDM parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+ pstats=np.array(model_sdm.z_stat)[1+k:,1]
+ pk=len(pstats)
+ ifpstats.max()<p:# least significant of two is still significant - SDM candidate
+ # check on spatial common factor
+ ifmodel_sdm.cfh_test[1]<p:# rejected - SDM
+ result='SDM'
+ iffinmod:
+ finreg=model_sdm
+ else:# not reject common factor hypothesis - ERROR
+ result='ERROR'
+
+ elifpstats.min()>=p:# none significant, even bother?
+ result='OLS'
+
+ else:# one significant and one non-sign spatial parameter
+ cand=pstats.argmax()# non-significant one
+ ifcand==(pk-1):# rho not sig
+ result='SLX'
+
+ else:# gamma not sig
+ result='LAG'
+
+ elifcand==(pk-2):# rho not significant, but at least one of lambda/gamma is
+ # go to SLX-Error
+ try:
+ model_slxerr=ERROR.GMM_Error(y,x,w=w,slx_lags=1,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: SLX Error parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+ pstats=np.array(model_slxerr.z_stat)[1+k:,1]
+ pk=len(pstats)
+
+ ifpstats.max()<p:# least significant of two is still significant
+ result='SLX-ERR'
+ iffinmod:
+ finreg=model_slxerr
+
+ elifpstats.min()>=p:# none significant, even bother?
+ result='OLS'
+
+ else:# one significant and one non-sign spatial parameter
+ cand=pstats.argmax()# non-significant one
+ ifcand==(pk-1):# lambda not sig
+ result='SLX'
+
+ else:# gamma not sig
+ result='ERROR'
+
+ else:# gamma not sig, but at least one of rho/lambda is
+ # go to SARSAR
+ try:
+ model_sarsar=ERROR.GMM_Error(y,x,w=w,add_wy=True,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: SARSAR parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+
+ pstats=np.array(model_sarsar.z_stat)[1+k:,1]
+ pk=len(pstats)
+ ifpstats.max()<p:# least significant of two is still significant
+ result='SARSAR'
+ iffinmod:
+ finreg=model_sarsar
+ elifpstats.min()>=p:# none significant, even bother?
+ result='OLS'
+
+ else:# one significant and one non-sign spatial parameter
+ cand=pstats.argmax()# non-significant one
+ ifcand==(pk-1):# lambda not sig
+ result='LAG'
+
+ else:# rho not sig
+ result='ERROR'
+
+
+ iffinmod:# pass final regression
+ msel="Model selected by GETS-GNS: "
+ ifresult=='OLS':
+ finreg=OLS.OLS(y,x,w=w,slx_lags=0,spat_diag=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ elifresult=='SLX':
+ finreg=OLS.OLS(y,x,w=w,slx_lags=1,spat_diag=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ elifresult=='ERROR':
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=0,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GMM Error parameters outside bounds"
+ finreg=False
+ elifresult=='LAG':
+ try:
+ finreg=STSLS.GM_Lag(y,x,w=w,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GM LAG parameters outside bounds"
+ finreg=False
+ ifmprint:
+ print(msel+result)
+ ifnot(finreg==False):# cannot print when finreg=False
+ print(finreg.summary)
+
+ return(result,finreg)
+
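+
+ # Toy illustration (made-up numbers) of how gets_gns above and gets_sdm below pull the
+ # spatial-parameter p-values out of z_stat: skip the constant and the k regressors, keep column 1.
+ import numpy as np
+
+ z_stat = [(5.1, 0.00), (2.3, 0.02),                # constant, one X variable (k = 1)
+           (0.7, 0.48), (2.6, 0.01), (1.1, 0.27)]   # gamma (WX), rho (Wy), lambda
+ k = 1
+ pstats = np.array(z_stat)[1 + k:, 1]
+ print(pstats)  # [0.48 0.01 0.27]
+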
+
+
+
+[docs]
+defgets_sdm(y,x,w,w_lags=2,robust=None,sig2n_k=True,
+ name_y=False,name_x=False,name_w=False,name_ds=False,latex=False,
+ p_value=0.01,finmod=True,mprint=True):
+
+"""
+ Hybrid specification search: Starting from the estimation of the Spatial Durbin model,
+ it tests significance of coefficients and carries out specification
+ tests for error autocorrelation to suggest the most appropriate model
+
+ Arguments:
+ ----------
+ y : dependent variable
+ x : matrix of explanatory variables
+ w : spatial weights
+ w_lags : number of lags to be used as instruments in S2SLS
+ robust : White standard errors?
+ sig2n_k : error variance estimate (consistent or unbiased=True)
+ name_y : name of dependent variable (string)
+ name_x : list of strings with x-variable names
+ name_w : string with name for spatial weights
+ name_ds : string with name for data set
+ latex : flag for latex output
+ p_value : significance threshold
+ finmod : flag for estimation of final model
+ mprint : flag for regression summary as search result
+
+ Returns:
+ ----------
+ result: the selected model as a string
+ 0 = OLS
+ 1 = LAG
+ 2 = ERROR
+ 3 = SARSAR
+ 4 = SLX
+ 5 = SDM
+ 6 = SLX-Err
+ 7 = GNS
+ finreg: regression object for final model
+
+ Example:
+ --------
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from spreg import gets_sdm
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+ >>> y_var = "CRIME"
+ >>> y = np.array([db.by_col(y_var)]).reshape(49, 1)
+ >>> x_var = ["INC", "HOVAL"]
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = "r"
+ >>> name_y = y_var
+ >>> name_x = x_var
+ >>> name_w = "Rook Weights"
+ >>> name_ds = "Columbus Data"
+
+ >>> result, finreg = gets_sdm(y, x, w, name_y=name_y, name_x=name_x,
+ ... name_w=name_w, name_ds=name_ds, mprint=False)
+ >>> print("Model selected by GETS-SDM:",result)
+ Model selected by GETS-SDM: OLS
+
+ """
+
+ finreg=False
+ p=p_value
+
+ k=x.shape[1]
+
+ ifnot(name_y)ornot(name_x):
+
+ try:
+ model_sdm=STSLS.GM_Lag(y,x,w=w,slx_lags=1,w_lags=w_lags,
+ hard_bound=True,
+ name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: SDM parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+ name_y=model_sdm.name_y
+ name_x=model_sdm.name_x[1:k+1]
+
+ else:
+
+ try:
+ model_sdm=STSLS.GM_Lag(y,x,w=w,slx_lags=1,w_lags=w_lags,
+ hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: SDM parameters out of bounds'
+ print(result)
+ return(result,finreg)
+
+
+
+ pstats=np.array(model_sdm.z_stat)[1+k:,1]# spatial parameters
+ pk=len(pstats)
+
+ ifpstats.max()<p:# least significant of two is still significant = SDM or GNS
+ # check on spatial common factor
+ ifmodel_sdm.cfh_test[1]>=p:# not rejected - ERROR
+ result='ERROR'
+
+ else:# could be GNS
+ ak_sdm=AKtest(model_sdm,w,case='gen')
+ ifak_sdm.p<p:# remaining error
+ result='GNS'
+
+ else:
+ result='SDM'
+ iffinmod:
+ finreg=model_sdm
+
+ elifpstats.min()>=p:# none significant - OLS or SEM
+ model_ols=OLS.OLS(y,x,w=w,spat_diag=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+
+ # check on LM-Error
+ errtest=LMtests(model_ols,w)
+ iferrtest.lme[1]<p:# ERROR
+ result='ERROR'
+
+ else:
+ result='OLS'
+ iffinmod:
+ finreg=model_ols
+
+ else:# one significant and one non-sign spatial parameter
+ cand=pstats.argmax()# non-significant one
+ ifcand==(pk-1):# rho not sig, SLX model
+ # check error in SLX
+ model_slx=OLS.OLS(y,x,w=w,slx_lags=1,spat_diag=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+
+ errtest=LMtests(model_slx,w)
+ iferrtest.lme[1]<p:# SLX-ERROR
+ result='SLX-Err'
+
+ else:
+ result='SLX'
+ iffinmod:
+ finreg=model_slx
+ else:# gamma not sign, lag model
+ try:
+ model_lag=STSLS.GM_Lag(y,x,w=w,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result='Exception: LAG parameters out of bounds'
+ print(result)
+ return(result,finreg)
+ #print(model_lag.summary)
+ ak_lag=AKtest(model_lag,w,case='gen')
+ ifak_lag.p<p:# remaining error
+ result='SARSAR'
+
+ else:# no error
+ result='LAG'
+ iffinmod:
+ finreg=model_lag
+
+
+ iffinmod:# pass final regression
+ msel="Model selected by GETS-SDM: "
+ ifresult=='ERROR':
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=0,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+" -- Exception: GMM Error parameters outside bounds"
+ finreg=False
+ elifresult=='SARSAR':
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,add_wy=True,slx_lags=0,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+' -- Exception: SARSAR parameters out of bounds'
+ return(result,finreg)
+ elifresult=='SLX-Err':
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=1,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+' -- Exception: SLX Error parameters out of bounds'
+ return(result,finreg)
+ elifresult=='GNS':
+ try:
+ finreg=ERROR.GMM_Error(y,x,w=w,slx_lags=1,add_wy=True,w_lags=w_lags,hard_bound=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds,latex=latex)
+ except:
+ result=result+' -- Exception: GNS parameters out of bounds'
+ ifmprint:
+ print(msel+result)
+ ifnot(finreg==False):# cannot print when finreg=False
+ print(finreg.summary)
+
+ return(result,finreg)
+
+
+
+def_test():
+ importdoctest
+
+ # the following line could be used to define an alternative to the '<BLANKLINE>' flag
+ # doctest.BLANKLINE_MARKER = 'something better than <BLANKLINE>'
+ start_suppress=np.get_printoptions()["suppress"]
+ np.set_printoptions(suppress=True)
+ doctest.testmod()
+ np.set_printoptions(suppress=start_suppress)
+
+
+if__name__=="__main__":
+ _test()
+
+ importnumpyasnp
+ importlibpysal
+ fromspregimportstge_classic
+
+ # Load data
+ db=libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),"r")
+
+ # Define the dependent variable
+ y_var="CRIME"
+ y=np.array([db.by_col(y_var)]).reshape(49,1)
+
+ # Define the explanatory variables
+ x_var=["INC","HOVAL"]
+ x=np.array([db.by_col(name)fornameinx_var]).T
+
+ # Define the spatial weights
+ w=libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ w.transform="r"
+
+ # Define the names for the variables and the dataset
+ name_y=y_var
+ name_x=x_var
+ name_w="Rook Weights"
+ name_ds="Columbus Data"
+
+ # Call the stge_classic function and output the results
+ result,finreg=stge_classic(y,x,w,mprint=True,
+ name_y=name_y,name_x=name_x,name_w=name_w,name_ds=name_ds)
+
+
+"""
+SUR and 3SLS estimation
+"""
+
+__author__="Luc Anselin lanselin@gmail.com, \
+ Pedro V. Amaral pedrovma@gmail.com"
+
+
+importnumpyasnp
+importnumpy.linalgasla
+from.importsummary_outputasSUMMARY
+from.importuser_outputasUSER
+from.importregimesasREGI
+from.sur_utilsimport(
+ sur_dictxy,
+ sur_dictZ,
+ sur_corr,
+ sur_crossprod,
+ sur_est,
+ sur_resids,
+ sur_predict,
+ check_k,
+)
+from.diagnostics_surimport(
+ sur_setp,
+ sur_lrtest,
+ sur_lmtest,
+ surLMe,
+ surLMlag,
+ sur_chow,
+)
+from.sputilsimportsphstack,spdot
+
+
+__all__=["SUR","ThreeSLS"]
+
+
+classBaseSUR:
+"""
+ Base class for SUR estimation, both two step as well as iterated
+
+ Parameters
+ ----------
+
+ bigy : dictionary
+ with vector for dependent variable by equation
+ bigX : dictionary
+ with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ iter : boolean
+ whether or not to use iterated estimation.
+ default = False
+ maxiter : int
+ maximum iterations; default = 5
+ epsilon : float
+ precision criterion to end iterations.
+ default = 0.00001
+ verbose : boolean
+ flag to print out iteration number and value of log det(sig)
+ at the beginning and the end of the iteration
+
+ Attributes
+ ----------
+ bigy : dictionary
+ with y values
+ bigX : dictionary
+ with X values
+ bigXX : dictionary
+ with :math:`X_t'X_r` cross-products
+ bigXy : dictionary
+ with :math:`X_t'y_r` cross-products
+ n_eq : int
+ number of equations
+ n : int
+ number of observations in each cross-section
+ bigK : array
+ vector with number of explanatory variables (including constant)
+ for each equation
+ bOLS : dictionary
+ with OLS regression coefficients for each equation
+ olsE : array
+ N x n_eq array with OLS residuals for each equation
+ bSUR : dictionary
+ with SUR regression coefficients for each equation
+ varb : array
+ variance-covariance matrix
+ bigE : array
+ N x n_eq array with SUR residuals for each equation
+ bigYP : array
+ N x n_eq array with SUR predicted values for each equation
+ sig : array
+ Sigma matrix of inter-equation error covariances
+ ldetS1 : float
+ log det(Sigma) for SUR model
+ resids : array
+ n by n_eq array of residuals
+ sig_ols : array
+ Sigma matrix for OLS residuals
+ ldetS0 : float
+ log det(Sigma) for null model (OLS by equation, diagonals only)
+ niter : int
+ number of iterations (=0 for iter=False)
+ corr : array
+ inter-equation SUR error correlation matrix
+ llik : float
+ log-likelihood (including the constant pi)
+ """
+
+    def __init__(
+        self, bigy, bigX, iter=False, maxiter=5, epsilon=0.00001, verbose=False
+    ):
+        # setting up the cross-products
+        self.bigy = bigy
+        self.bigX = bigX
+        self.n_eq = len(bigy.keys())
+        self.n = bigy[0].shape[0]
+        self.bigK = np.zeros((self.n_eq, 1), dtype=np.int_)
+        for r in range(self.n_eq):
+            self.bigK[r] = self.bigX[r].shape[1]
+        self.bigXX, self.bigXy = sur_crossprod(self.bigX, self.bigy)
+        # OLS regression by equation, sets up initial residuals
+        _sur_ols(self)  # creates self.bOLS and self.olsE
+        # SUR estimation using OLS residuals - two step estimation
+        self.bSUR, self.varb, self.sig = sur_est(
+            self.bigXX, self.bigXy, self.olsE, self.bigK
+        )
+        resids = sur_resids(self.bigy, self.bigX, self.bSUR)  # matrix of residuals
+        # Sigma and log det(Sigma) for null model
+        self.sig_ols = self.sig
+        sols = np.diag(np.diag(self.sig))
+        self.ldetS0 = np.log(np.diag(sols)).sum()
+        det0 = self.ldetS0
+        # setup for iteration
+        det1 = la.slogdet(self.sig)[1]
+        self.ldetS1 = det1
+        # self.niter = 0
+        if iter:  # iterated FGLS aka ML
+            n_iter = 0
+            while np.abs(det1 - det0) > epsilon and n_iter <= maxiter:
+                n_iter += 1
+                det0 = det1
+                self.bSUR, self.varb, self.sig = sur_est(
+                    self.bigXX, self.bigXy, resids, self.bigK
+                )
+                resids = sur_resids(self.bigy, self.bigX, self.bSUR)
+                det1 = la.slogdet(self.sig)[1]
+                if verbose:
+                    print(n_iter, det0, det1)
+            self.bigE = sur_resids(self.bigy, self.bigX, self.bSUR)
+            self.ldetS1 = det1
+            self.niter = n_iter
+        else:
+            self.niter = 1
+            self.bigE = resids
+        self.bigYP = sur_predict(self.bigy, self.bigX, self.bSUR)  # LA added 10/30/16
+        self.corr = sur_corr(self.sig)
+        lik = self.n_eq * (1.0 + np.log(2.0 * np.pi)) + self.ldetS1
+        self.llik = -(self.n / 2.0) * lik
+
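+# The two-step estimator above delegates the feasible GLS step to sur_est
+# (imported from sur_utils). As a rough, self-contained illustration of what
+# that step amounts to (a sketch only, not spreg's actual sur_est; the name
+# fgls_sketch and its argument layout are hypothetical), the stacked SUR
+# normal equations with an estimated Sigma can be solved as follows:
+def fgls_sketch(bigX, bigy, resids):
+    n, n_eq = resids.shape
+    sig = resids.T @ resids / n                       # cross-equation error covariance
+    sigi = np.linalg.inv(sig)
+    ks = [bigX[r].shape[1] for r in range(n_eq)]
+    ofs = np.r_[0, np.cumsum(ks)]
+    A = np.zeros((ofs[-1], ofs[-1]))
+    b = np.zeros((ofs[-1], 1))
+    for r in range(n_eq):
+        for s in range(n_eq):
+            # (r,s) block of X'(Sigma^-1 kron I)X and r-th block of X'(Sigma^-1 kron I)y
+            A[ofs[r]:ofs[r + 1], ofs[s]:ofs[s + 1]] = sigi[r, s] * (bigX[r].T @ bigX[s])
+            b[ofs[r]:ofs[r + 1]] += sigi[r, s] * (bigX[r].T @ bigy[s])
+    varb = np.linalg.inv(A)                           # asymptotic variance of the stacked betas
+    beta = varb @ b                                   # stacked SUR coefficients
+    return beta, varb, sig
+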
+
+def _sur_ols(reg):
+"""
+ OLS estimation of SUR equations
+
+ Parameters
+ ----------
+ reg : BaseSUR object
+
+ Return
+ -------
+ reg.bOLS : dictionary
+ with regression coefficients for each equation
+ reg.olsE : array
+ N x n_eq array with OLS residuals for each equation
+
+ """
+    reg.bOLS = {}
+    for r in range(reg.n_eq):
+        reg.bOLS[r] = np.dot(la.inv(reg.bigXX[(r, r)]), reg.bigXy[(r, r)])
+    reg.olsE = sur_resids(reg.bigy, reg.bigX, reg.bOLS)
+    return reg
+
+
+
+[docs]
+class SUR(BaseSUR, REGI.Regimes_Frame):
+"""
+ User class for SUR estimation, both two step as well as iterated
+
+ Parameters
+ ----------
+ bigy : list or dictionary
+ list with the name of the dependent variable for each equation
+ or dictionary with vectors for dependent variable by equation
+ bigX : list or dictionary
+ list of lists the name of the explanatory variables for each equation
+ or dictionary with matrix of explanatory variables by equation
+ (note, already includes constant term)
+    df           : Pandas DataFrame
+                   Optional. Required in case bigy and bigX are lists with names of variables
+ w : spatial weights object
+ default = None
+ regimes : list
+ default = None.
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ nonspat_diag: boolean
+ flag for non-spatial diagnostics, default = True
+ spat_diag : boolean
+ flag for spatial diagnostics, default = False
+ iter : boolean
+ whether or not to use iterated estimation.
+ default = False
+ maxiter : int
+ maximum iterations; default = 5
+ epsilon : float
+ precision criterion to end iterations.
+ default = 0.00001
+ verbose : boolean
+ flag to print out iteration number and value
+ of log det(sig) at the beginning and the end of the iteration
+ name_bigy : dictionary
+ with name of dependent variable for each equation.
+ default = None, but should be specified
+ is done when sur_stackxy is used
+ name_bigX : dictionary
+ with names of explanatory variables for each equation.
+ default = None, but should be specified
+ is done when sur_stackxy is used
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the weights file
+ name_regimes : string
+ name of regime variable for use in the output
+
+ Attributes
+ ----------
+ bigy : dictionary
+ with y values
+ bigX : dictionary
+ with X values
+ bigXX : dictionary
+ with :math:`X_t'X_r` cross-products
+ bigXy : dictionary
+ with :math:`X_t'y_r` cross-products
+ n_eq : int
+ number of equations
+ n : int
+ number of observations in each cross-section
+ bigK : array
+ vector with number of explanatory variables (including constant)
+ for each equation
+ bOLS : dictionary
+ with OLS regression coefficients for each equation
+ olsE : array
+ N x n_eq array with OLS residuals for each equation
+ bSUR : dictionary
+ with SUR regression coefficients for each equation
+ varb : array
+ variance-covariance matrix
+ bigE : array
+ n by n_eq array of residuals
+ sig_ols : array
+ Sigma matrix for OLS residuals (diagonal)
+ ldetS0 : float
+ log det(Sigma) for null model (OLS by equation)
+ niter : int
+ number of iterations (=0 for iter=False)
+ corr : array
+ inter-equation error correlation matrix
+ llik : float
+ log-likelihood (including the constant pi)
+ sur_inf : dictionary
+ with standard error, asymptotic t and p-value,
+ one for each equation
+ lrtest : tuple
+ Likelihood Ratio test on off-diagonal elements of sigma
+ (tuple with test,df,p-value)
+ lmtest : tuple
+                   Lagrange Multiplier test on off-diagonal elements of sigma
+ (tuple with test,df,p-value)
+ lmEtest : tuple
+ Lagrange Multiplier test on error spatial autocorrelation in SUR
+ (tuple with test, df, p-value)
+ lmlagtest : tuple
+ Lagrange Multiplier test on spatial lag autocorrelation in SUR
+ (tuple with test, df, p-value)
+ surchow : array
+ list with tuples for Chow test on regression coefficients.
+ each tuple contains test value, degrees of freedom, p-value
+ name_bigy : dictionary
+ with name of dependent variable for each equation
+ name_bigX : dictionary
+ with names of explanatory variables for each
+ equation
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the weights file
+ name_regimes : string
+ name of regime variable for use in the output
+
+
+ Examples
+ --------
+
+ >>> import libpysal
+ >>> import geopandas as gpd
+ >>> from spreg import SUR
+
+ Open data on NCOVR US County Homicides (3085 areas) from libpysal examples using geopandas.
+
+ >>> nat = libpysal.examples.load_example('Natregimes')
+ >>> df = gpd.read_file(nat.get_path("natregimes.shp"))
+
+ The specification of the model to be estimated can be provided as lists.
+ Each equation should be listed separately. In this example, equation 1
+ has HR80 as dependent variable and PS80 and UE80 as exogenous regressors.
+ For equation 2, HR90 is the dependent variable, and PS90 and UE90 the
+ exogenous regressors.
+
+ >>> y_var = ['HR80','HR90']
+ >>> x_var = [['PS80','UE80'],['PS90','UE90']]
+
+ Although not required for this method, we can create a weights matrix
+ to allow for spatial diagnostics.
+
+ >>> w = libpysal.weights.Queen.from_dataframe(df)
+ >>> w.transform='r'
+
+ We can now run the regression and then have a summary of the output by typing:
+ 'print(reg.summary)'
+
+ >>> reg = SUR(y_var,x_var,df=df,w=w,spat_diag=True,name_ds="nat")
+ >>> print(reg.summary)
+ REGRESSION
+ ----------
+ SUMMARY OF OUTPUT: SEEMINGLY UNRELATED REGRESSIONS (SUR)
+ --------------------------------------------------------
+ Data set : nat
+ Weights matrix : unknown
+ Number of Equations : 2 Number of Observations: 3085
+ Log likelihood (SUR): -19902.966 Number of Iterations : 1
+ ----------
+ <BLANKLINE>
+ SUMMARY OF EQUATION 1
+ ---------------------
+ Dependent Variable : HR80 Number of Variables : 3
+ Mean dependent var : 6.9276 Degrees of Freedom : 3082
+ S.D. dependent var : 6.8251
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ Constant_1 5.1390718 0.2624673 19.5798587 0.0000000
+ PS80 0.6776481 0.1219578 5.5564132 0.0000000
+ UE80 0.2637240 0.0343184 7.6846277 0.0000000
+ ------------------------------------------------------------------------------------
+ <BLANKLINE>
+ SUMMARY OF EQUATION 2
+ ---------------------
+ Dependent Variable : HR90 Number of Variables : 3
+ Mean dependent var : 6.1829 Degrees of Freedom : 3082
+ S.D. dependent var : 6.6403
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ Constant_2 3.6139403 0.2534996 14.2561949 0.0000000
+ PS90 1.0260715 0.1121662 9.1477755 0.0000000
+ UE90 0.3865499 0.0341996 11.3027760 0.0000000
+ ------------------------------------------------------------------------------------
+ <BLANKLINE>
+ <BLANKLINE>
+ REGRESSION DIAGNOSTICS
+ TEST DF VALUE PROB
+ LM test on Sigma 1 680.168 0.0000
+ LR test on Sigma 1 768.385 0.0000
+ <BLANKLINE>
+ OTHER DIAGNOSTICS - CHOW TEST BETWEEN EQUATIONS
+ VARIABLES DF VALUE PROB
+ Constant_1, Constant_2 1 26.729 0.0000
+ PS80, PS90 1 8.241 0.0041
+ UE80, UE90 1 9.384 0.0022
+ <BLANKLINE>
+ DIAGNOSTICS FOR SPATIAL DEPENDENCE
+ TEST DF VALUE PROB
+ Lagrange Multiplier (error) 2 1333.586 0.0000
+ Lagrange Multiplier (lag) 2 1275.821 0.0000
+ <BLANKLINE>
+ ERROR CORRELATION MATRIX
+ EQUATION 1 EQUATION 2
+ 1.000000 0.469548
+ 0.469548 1.000000
+ ================================ END OF REPORT =====================================
+ """
+
+
+[docs]
+    def __init__(
+ self,
+ bigy,
+ bigX,
+ df=None,
+ w=None,
+ regimes=None,
+ nonspat_diag=True,
+ spat_diag=False,
+ vm=False,
+ iter=False,
+ maxiter=5,
+ epsilon=0.00001,
+ verbose=False,
+ name_bigy=None,
+ name_bigX=None,
+ name_ds=None,
+ name_w=None,
+ name_regimes=None,
+ ):
+
+        if isinstance(bigy, list) or isinstance(bigX, list):
+            if isinstance(bigy, list) and isinstance(bigX, list):
+                if len(bigy) == len(bigX):
+                    if df is not None:
+                        bigy, bigX, name_bigy, name_bigX = sur_dictxy(df, bigy, bigX)
+                    else:
+                        raise Exception("Error: df argument is required if bigy and bigX are lists")
+                else:
+                    raise Exception("Error: bigy and bigX must have the same number of elements")
+            else:
+                raise Exception("Error: bigy and bigX must be both lists or both dictionaries")
+
+ self.name_ds=USER.set_name_ds(name_ds)
+ self.name_w=USER.set_name_w(name_w,w)
+ self.n_eq=len(bigy.keys())
+
+ # initialize names - should be generated by sur_stack
+        if name_bigy:
+            self.name_bigy = name_bigy
+        else:  # need to construct y names
+            self.name_bigy = {}
+            for r in range(self.n_eq):
+                yn = "dep_var_" + str(r)
+                self.name_bigy[r] = yn
+        if name_bigX is None:
+            name_bigX = {}
+            for r in range(self.n_eq):
+                # k = self.bigX[r].shape[1] - 1
+                k = bigX[r].shape[1] - 1
+                name_x = ["var_" + str(i + 1) + "_" + str(r) for i in range(k)]
+                ct = "Constant_" + str(r)  # NOTE: constant always included in X
+                name_x.insert(0, ct)
+                name_bigX[r] = name_x
+
+        if regimes is not None:
+ self.constant_regi="many"
+ self.cols2regi="all"
+ self.regime_err_sep=False
+ self.name_regimes=USER.set_name_ds(name_regimes)
+ self.regimes_set=REGI._get_regimes_set(regimes)
+ self.regimes=regimes
+ self.name_x_r=name_bigX
+ cols2regi_dic={}
+ self.name_bigX={}
+            for r in range(self.n_eq):
+ cols2regi_dic[r]=REGI.check_cols2regi(
+ self.constant_regi,self.cols2regi,bigX[r],add_cons=False
+ )
+ USER.check_regimes(self.regimes_set,bigy[0].shape[0],bigX[r].shape[1])
+ bigX[r],self.name_bigX[r]=REGI.Regimes_Frame.__init__(
+ self,
+ bigX[r],
+ regimes,
+ constant_regi=None,
+ cols2regi=cols2regi_dic[r],
+ names=name_bigX[r],
+ )
+ else:
+ self.name_bigX=name_bigX
+
+ # need checks on match between bigy, bigX dimensions
+ # init moved here before name check
+ BaseSUR.__init__(
+ self,
+ bigy=bigy,
+ bigX=bigX,
+ iter=iter,
+ maxiter=maxiter,
+ epsilon=epsilon,
+ verbose=verbose,
+ )
+
+ # inference
+ self.sur_inf=sur_setp(self.bSUR,self.varb)
+
+        if nonspat_diag:
+ # LR test on off-diagonal elements of Sigma
+ self.lrtest=sur_lrtest(self.n,self.n_eq,self.ldetS0,self.ldetS1)
+
+ # LM test on off-diagonal elements of Sigma
+ self.lmtest=sur_lmtest(self.n,self.n_eq,self.sig_ols)
+ else:
+ self.lrtest=None
+ self.lmtest=None
+
+        if spat_diag:
+            if not w:
+                raise Exception("Error: spatial weights needed")
+ WS=w.sparse
+ # LM test on spatial error autocorrelation
+ self.lmEtest=surLMe(self.n_eq,WS,self.bigE,self.sig)
+ # LM test on spatial lag autocorrelation
+ self.lmlagtest=surLMlag(
+ self.n_eq,
+ WS,
+ self.bigy,
+ self.bigX,
+ self.bigE,
+ self.bigYP,
+ self.sig,
+ self.varb,
+ )
+ else:
+ self.lmEtest=None
+ self.lmlagtest=None
+
+ # test on constancy of coefficients across equations
+        if check_k(self.bigK):  # only for equal number of variables
+ self.surchow=sur_chow(self.n_eq,self.bigK,self.bSUR,self.varb)
+ else:
+ self.surchow=None
+
+ # Listing of the results
+ self.title="SEEMINGLY UNRELATED REGRESSIONS (SUR)"
+        if regimes is not None:
+ self.title+=" - REGIMES"
+ self.chow_regimes={}
+ varb_counter=0
+            for r in range(self.n_eq):
+ counter_end=varb_counter+self.bSUR[r].shape[0]
+ self.chow_regimes[r]=REGI._chow_run(
+ len(cols2regi_dic[r]),
+ 0,
+ 0,
+ len(self.regimes_set),
+ self.bSUR[r],
+ self.varb[varb_counter:counter_end,varb_counter:counter_end],
+ )
+ varb_counter=counter_end
+ regimes=True
+
+ SUMMARY.SUR(
+ reg=self,
+ nonspat_diag=nonspat_diag,
+ spat_diag=spat_diag,
+ surlm=True,
+ regimes=regimes,
+ )
+
+
+
+
+class BaseThreeSLS:
+"""
+ Base class for 3SLS estimation, two step
+
+ Parameters
+ ----------
+ bigy : dictionary
+ with vector for dependent variable by equation
+ bigX : dictionary
+ with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ bigyend : dictionary
+ with matrix of endogenous variables by equation
+ bigq : dictionary
+ with matrix of instruments by equation
+
+ Attributes
+ ----------
+ bigy : dictionary
+ with y values
+ bigZ : dictionary
+ with matrix of exogenous and endogenous variables
+ for each equation
+ bigZHZH : dictionary
+ with matrix of cross products Zhat_r'Zhat_s
+ bigZHy : dictionary
+ with matrix of cross products Zhat_r'y_end_s
+ n_eq : int
+ number of equations
+ n : int
+ number of observations in each cross-section
+ bigK : array
+ vector with number of explanatory variables (including constant,
+ exogenous and endogenous) for each equation
+ b2SLS : dictionary
+ with 2SLS regression coefficients for each equation
+ tslsE : array
+                 N x n_eq array with 2SLS residuals for each equation
+ b3SLS : dictionary
+ with 3SLS regression coefficients for each equation
+ varb : array
+ variance-covariance matrix
+ sig : array
+ Sigma matrix of inter-equation error covariances
+ bigE : array
+ n by n_eq array of residuals
+ corr : array
+ inter-equation 3SLS error correlation matrix
+
+ """
+
+    def __init__(self, bigy, bigX, bigyend, bigq):
+        # setting up the cross-products
+        self.bigy = bigy
+        self.n_eq = len(bigy.keys())
+        self.n = bigy[0].shape[0]
+        # dictionary with exog and endog, Z
+        self.bigZ = {}
+        for r in range(self.n_eq):
+            self.bigZ[r] = sphstack(bigX[r], bigyend[r])
+        # number of explanatory variables by equation
+        self.bigK = np.zeros((self.n_eq, 1), dtype=np.int_)
+        for r in range(self.n_eq):
+            self.bigK[r] = self.bigZ[r].shape[1]
+        # dictionary with instruments, H
+        bigH = {}
+        for r in range(self.n_eq):
+            bigH[r] = sphstack(bigX[r], bigq[r])
+        # dictionary with instrumental variables, X and yend_predicted, Z-hat
+        bigZhat = _get_bigZhat(self, bigX, bigyend, bigH)
+        self.bigZHZH, self.bigZHy = sur_crossprod(bigZhat, self.bigy)
+
+        # 2SLS regression by equation, sets up initial residuals
+        _sur_2sls(self)  # creates self.b2SLS and self.tslsE
+
+        self.b3SLS, self.varb, self.sig = sur_est(
+            self.bigZHZH, self.bigZHy, self.tslsE, self.bigK
+        )
+        self.bigE = sur_resids(self.bigy, self.bigZ, self.b3SLS)  # matrix of residuals
+
+        # inter-equation correlation matrix
+        self.corr = sur_corr(self.sig)
+
+
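+# Note: _get_bigZhat and _sur_2sls are helpers defined elsewhere in this
+# module. As a rough illustration of the first of those steps (a sketch under
+# standard 2SLS assumptions, not spreg's actual helper; zhat_sketch is a
+# hypothetical name), Zhat collects, per equation, the fitted values from
+# regressing Z = [X, yend] on the instrument set H = [X, q]:
+def zhat_sketch(bigX, bigyend, bigH):
+    bigZhat = {}
+    for r in range(len(bigX)):
+        Z = np.hstack((bigX[r], bigyend[r]))           # exogenous plus endogenous variables
+        H = bigH[r]                                    # instruments for equation r
+        # first-stage projection: H (H'H)^{-1} H'Z
+        bigZhat[r] = H @ np.linalg.inv(H.T @ H) @ (H.T @ Z)
+    return bigZhat
+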
+
+[docs]
+class ThreeSLS(BaseThreeSLS, REGI.Regimes_Frame):
+"""
+ User class for 3SLS estimation
+
+ Parameters
+ ----------
+ bigy : list or dictionary
+ list with the names of the dependent variable for each equation
+ or dictionary with vectors for dependent variable by equation
+ bigX : list or dictionary
+ list of lists the names of the explanatory variables for each equation
+ or dictionary with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ bigyend : list or dictionary
+ list of lists the names of the endogenous variables for each equation
+ or dictionary with matrix of endogenous variables by equation
+ bigq : list or dictionary
+ list of lists the names of the instrument variables for each equation
+ or dictionary with matrix of instruments by equation
+    df           : Pandas DataFrame
+                   Optional. Required in case bigy and bigX are lists with names of variables
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ nonspat_diag: boolean
+ flag for non-spatial diagnostics, default = True.
+ name_bigy : dictionary
+ with name of dependent variable for each equation.
+ default = None, but should be specified.
+ is done when sur_stackxy is used
+ name_bigX : dictionary
+ with names of explanatory variables for each equation.
+ default = None, but should be specified.
+ is done when sur_stackxy is used
+ name_bigyend : dictionary
+ with names of endogenous variables for each equation.
+ default = None, but should be specified.
+ is done when sur_stackZ is used
+ name_bigq : dictionary
+ with names of instrumental variables for each equation.
+ default = None, but should be specified.
+ is done when sur_stackZ is used.
+ name_ds : string
+ name for the data set.
+ name_regimes : string
+ name of regime variable for use in the output.
+
+ Attributes
+ ----------
+
+ bigy : dictionary
+ with y values
+ bigZ : dictionary
+ with matrix of exogenous and endogenous variables
+ for each equation
+ bigZHZH : dictionary
+ with matrix of cross products Zhat_r'Zhat_s
+ bigZHy : dictionary
+ with matrix of cross products Zhat_r'y_end_s
+ n_eq : int
+ number of equations
+ n : int
+ number of observations in each cross-section
+ bigK : array
+ vector with number of explanatory variables (including constant,
+ exogenous and endogenous) for each equation
+ b2SLS : dictionary
+ with 2SLS regression coefficients for each equation
+ tslsE : array
+                   N x n_eq array with 2SLS residuals for each equation
+ b3SLS : dictionary
+ with 3SLS regression coefficients for each equation
+ varb : array
+ variance-covariance matrix
+ sig : array
+ Sigma matrix of inter-equation error covariances
+ bigE : array
+ n by n_eq array of residuals
+ corr : array
+ inter-equation 3SLS error correlation matrix
+ tsls_inf : dictionary
+ with standard error, asymptotic t and p-value,
+ one for each equation
+ surchow : array
+ list with tuples for Chow test on regression coefficients
+ each tuple contains test value, degrees of freedom, p-value
+ name_ds : string
+ name for the data set
+ name_bigy : dictionary
+ with name of dependent variable for each equation
+ name_bigX : dictionary
+ with names of explanatory variables for each
+ equation
+ name_bigyend : dictionary
+ with names of endogenous variables for each
+ equation
+ name_bigq : dictionary
+ with names of instrumental variables for each
+ equations
+ name_regimes : string
+ name of regime variable for use in the output
+
+
+ Examples
+ --------
+
+ >>> import libpysal
+ >>> import geopandas as gpd
+ >>> from spreg import ThreeSLS
+ >>> import numpy as np
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+
+ Open data on NCOVR US County Homicides (3085 areas) from libpysal examples using geopandas.
+
+ >>> nat = libpysal.examples.load_example('Natregimes')
+ >>> df = gpd.read_file(nat.get_path("natregimes.shp"))
+
+ The specification of the model to be estimated can be provided as lists.
+ Each equation should be listed separately. In this example, equation 1
+ has HR80 as dependent variable, PS80 and UE80 as exogenous regressors,
+ RD80 as endogenous regressor and FP79 as additional instrument.
+ For equation 2, HR90 is the dependent variable, PS90 and UE90 the
+    exogenous regressors, RD90 as endogenous regressor and FP89 as
+ additional instrument
+
+ >>> y_var = ['HR80','HR90']
+ >>> x_var = [['PS80','UE80'],['PS90','UE90']]
+ >>> yend_var = [['RD80'],['RD90']]
+ >>> q_var = [['FP79'],['FP89']]
+
+ We can now run the regression and then have a summary of the output by typing:
+ print(reg.summary)
+
+ Alternatively, we can just check the betas and standard errors, asymptotic t
+ and p-value of the parameters:
+
+ >>> reg = ThreeSLS(y_var,x_var,yend_var,q_var,df=df,name_ds="NAT")
+ >>> reg.b3SLS
+ {0: array([[6.92426353],
+ [1.42921826],
+ [0.00049435],
+ [3.5829275 ]]), 1: array([[ 7.62385875],
+ [ 1.65031181],
+ [-0.21682974],
+ [ 3.91250428]])}
+
+ >>> reg.tsls_inf
+ {0: array([[ 0.23220853, 29.81916157, 0. ],
+ [ 0.10373417, 13.77770036, 0. ],
+ [ 0.03086193, 0.01601807, 0.98721998],
+ [ 0.11131999, 32.18584124, 0. ]]), 1: array([[ 0.28739415, 26.52753638, 0. ],
+ [ 0.09597031, 17.19606554, 0. ],
+ [ 0.04089547, -5.30204786, 0.00000011],
+ [ 0.13586789, 28.79638723, 0. ]])}
+
+ """
+
+
+[docs]
+    def __init__(
+ self,
+ bigy,
+ bigX,
+ bigyend,
+ bigq,
+ df=None,
+ regimes=None,
+ nonspat_diag=True,
+ name_bigy=None,
+ name_bigX=None,
+ name_bigyend=None,
+ name_bigq=None,
+ name_ds=None,
+ name_regimes=None,
+ ):
+
+        if isinstance(bigy, list) or isinstance(bigX, list) or isinstance(bigyend, list) or isinstance(bigq, list):
+            if isinstance(bigy, list) and isinstance(bigX, list) and isinstance(bigyend, list) and isinstance(bigq, list):
+                if len(bigy) == len(bigX) == len(bigyend) == len(bigq):
+                    if df is not None:
+                        bigy, bigX, name_bigy, name_bigX = sur_dictxy(df, bigy, bigX)
+                        bigyend, name_bigyend = sur_dictZ(df, bigyend)
+                        bigq, name_bigq = sur_dictZ(df, bigq)
+                    else:
+                        raise Exception("Error: df argument is required if bigy, bigX, bigyend and bigq are lists")
+                else:
+                    raise Exception("Error: bigy, bigX, bigyend and bigq must have the same number of elements")
+            else:
+                raise Exception("Error: bigy, bigX, bigyend and bigq must be all lists or all dictionaries")
+
+ self.name_ds=USER.set_name_ds(name_ds)
+ self.n_eq=len(bigy.keys())
+
+ # initialize names - should be generated by sur_stack
+        if name_bigy:
+            self.name_bigy = name_bigy
+        else:  # need to construct y names
+            self.name_bigy = {}
+            for r in range(self.n_eq):
+                yn = "dep_var_" + str(r + 1)
+                self.name_bigy[r] = yn
+
+        if name_bigX is None:
+            name_bigX = {}
+            for r in range(self.n_eq):
+                k = bigX[r].shape[1] - 1
+                name_x = ["var_" + str(i + 1) + "_" + str(r + 1) for i in range(k)]
+                ct = "Constant_" + str(r + 1)  # NOTE: constant always included in X
+                name_x.insert(0, ct)
+                name_bigX[r] = name_x
+
+        if name_bigyend is None:
+            name_bigyend = {}
+            for r in range(self.n_eq):
+                ky = bigyend[r].shape[1]
+                name_ye = ["end_" + str(i + 1) + "_" + str(r + 1) for i in range(ky)]
+                name_bigyend[r] = name_ye
+
+        if name_bigq is None:
+            name_bigq = {}
+            for r in range(self.n_eq):
+                ki = bigq[r].shape[1]
+                name_i = ["inst_" + str(i + 1) + "_" + str(r + 1) for i in range(ki)]
+                name_bigq[r] = name_i
+
+        if regimes is not None:
+ self.constant_regi="many"
+ self.cols2regi="all"
+ self.regime_err_sep=False
+ self.name_regimes=USER.set_name_ds(name_regimes)
+ self.regimes_set=REGI._get_regimes_set(regimes)
+ self.regimes=regimes
+ cols2regi_dic={}
+ self.name_bigX,self.name_x_r,self.name_bigq,self.name_bigyend=(
+ {},
+ {},
+ {},
+ {},
+ )
+
+            for r in range(self.n_eq):
+                self.name_x_r[r] = name_bigX[r] + name_bigyend[r]
+ cols2regi_dic[r]=REGI.check_cols2regi(
+ self.constant_regi,
+ self.cols2regi,
+ bigX[r],
+ yend=bigyend[r],
+ add_cons=False,
+ )
+ USER.check_regimes(self.regimes_set,bigy[0].shape[0],bigX[r].shape[1])
+ bigX[r],self.name_bigX[r]=REGI.Regimes_Frame.__init__(
+ self,
+ bigX[r],
+ regimes,
+ constant_regi=None,
+ cols2regi=cols2regi_dic[r],
+ names=name_bigX[r],
+ )
+ bigq[r],self.name_bigq[r]=REGI.Regimes_Frame.__init__(
+ self,
+ bigq[r],
+ regimes,
+ constant_regi=None,
+ cols2regi="all",
+ names=name_bigq[r],
+ )
+ bigyend[r],self.name_bigyend[r]=REGI.Regimes_Frame.__init__(
+ self,
+ bigyend[r],
+ regimes,
+ constant_regi=None,
+ cols2regi=cols2regi_dic[r],
+ yend=True,
+ names=name_bigyend[r],
+ )
+ else:
+ self.name_bigX,self.name_bigq,self.name_bigyend=(
+ name_bigX,
+ name_bigq,
+ name_bigyend,
+ )
+ # need checks on match between bigy, bigX dimensions
+ BaseThreeSLS.__init__(self,bigy=bigy,bigX=bigX,bigyend=bigyend,bigq=bigq)
+
+ # inference
+ self.tsls_inf=sur_setp(self.b3SLS,self.varb)
+
+ # test on constancy of coefficients across equations
+        if check_k(self.bigK):  # only for equal number of variables
+ self.surchow=sur_chow(self.n_eq,self.bigK,self.b3SLS,self.varb)
+ else:
+ self.surchow=None
+
+ # Listing of the results
+ self.title="THREE STAGE LEAST SQUARES (3SLS)"
+        if regimes is not None:
+ self.title+=" - REGIMES"
+ self.chow_regimes={}
+ varb_counter=0
+            for r in range(self.n_eq):
+ counter_end=varb_counter+self.b3SLS[r].shape[0]
+ self.chow_regimes[r]=REGI._chow_run(
+ len(cols2regi_dic[r]),
+ 0,
+ 0,
+ len(self.regimes_set),
+ self.b3SLS[r],
+ self.varb[varb_counter:counter_end,varb_counter:counter_end],
+ )
+ varb_counter=counter_end
+ regimes=True
+
+ SUMMARY.SUR(
+ reg=self,tsls=True,ml=False,nonspat_diag=nonspat_diag,regimes=regimes
+ )
+"""
+Spatial Error SUR estimation
+"""
+
+__author__="Luc Anselin lanselin@gmail.com, \
+ Pedro V. Amaral pedrovma@gmail.com"
+
+
+import numpy as np
+import numpy.linalg as la
+from scipy import stats
+
+stats.chisqprob = stats.chi2.sf
+from . import summary_output as SUMMARY
+from . import user_output as USER
+from . import regimes as REGI
+from scipy.sparse.linalg import splu as SuperLU
+from scipy.optimize import minimize_scalar, minimize
+from scipy import sparse as sp
+
+from .ml_error import err_c_loglik_sp
+from .utils import optim_moments
+from .sur_utils import (
+ sur_dictxy,
+ sur_corr,
+ sur_dict2mat,
+ sur_crossprod,
+ sur_est,
+ sur_resids,
+ filter_dict,
+ check_k,
+)
+from .sur import BaseSUR, _sur_ols
+from .diagnostics_sur import sur_setp, lam_setp, sur_chow
+from .regimes import buildR, wald_test
+
+__all__=["BaseSURerrorGM","SURerrorGM","BaseSURerrorML","SURerrorML"]
+
+
+class BaseSURerrorGM:
+"""Base class for SUR Error estimation by Generalized Moments
+
+ Parameters
+ ----------
+ bigy : dictionary
+ with vector for dependent variable by equation
+ bigX : dictionary
+ with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ w : spatial weights object
+
+ Attributes
+ ----------
+ n_eq : int
+ number of equations
+ n : int
+ number of observations in each cross-section
+ bigy : dictionary
+ with vectors of dependent variable, one for
+ each equation
+ bigX : dictionary
+ with matrices of explanatory variables,
+ one for each equation
+ bigK : array
+ n_eq x 1 array with number of explanatory variables
+ by equation
+ bigylag : dictionary
+ spatially lagged dependent variable
+ bigXlag : dictionary
+ spatially lagged explanatory variable
+ lamsur : float
+ spatial autoregressive coefficient in GM SUR Error
+ bSUR : array
+ beta coefficients in GM SUR Error
+ varb : array
+ variance of beta coefficients in GM SUR Error
+ sig : array
+ error variance-covariance matrix in GM SUR Error
+ corr : array
+ error correlation matrix
+ bigE : array
+ n by n_eq matrix of vectors of residuals for each equation
+
+ """
+
+    def __init__(self, bigy, bigX, w):
+        self.n = w.n
+        self.n_eq = len(bigy.keys())
+        WS = w.sparse
+        I = sp.identity(self.n)
+        # variables
+        self.bigy = bigy
+        self.bigX = bigX
+        # number of variables by equation
+        self.bigK = np.zeros((self.n_eq, 1), dtype=np.int_)
+        for r in range(self.n_eq):
+            self.bigK[r] = self.bigX[r].shape[1]
+
+        # OLS
+        self.bigXX, self.bigXy = sur_crossprod(self.bigX, self.bigy)
+        reg0 = _sur_ols(self)
+
+        # Moments
+        moments = _momentsGM_sur_Error(WS, reg0.olsE)
+        lam = np.zeros((self.n_eq, 1))
+        for r in range(self.n_eq):
+            lam[r] = optim_moments(moments[r])
+
+        # spatially lagged variables
+        self.bigylag = {}
+        for r in range(self.n_eq):
+            self.bigylag[r] = WS * self.bigy[r]
+        # note: unlike WX as instruments, this includes the constant
+        self.bigXlag = {}
+        for r in range(self.n_eq):
+            self.bigXlag[r] = WS * self.bigX[r]
+
+        # spatially filtered variables
+        sply = filter_dict(lam, self.bigy, self.bigylag)
+        splX = filter_dict(lam, self.bigX, self.bigXlag)
+        WbigE = WS * reg0.olsE
+        splbigE = reg0.olsE - WbigE * lam.T
+        splXX, splXy = sur_crossprod(splX, sply)
+        b1, varb1, sig1 = sur_est(splXX, splXy, splbigE, self.bigK)
+        bigE = sur_resids(self.bigy, self.bigX, b1)
+
+        self.lamsur = lam
+        self.bSUR = b1
+        self.varb = varb1
+        self.sig = sig1
+        self.corr = sur_corr(self.sig)
+        self.bigE = bigE
+
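+# filter_dict (imported from sur_utils) performs the spatial Cochrane-Orcutt
+# style filtering used above. A minimal sketch of that operation, assuming
+# aligned dictionaries of variables and their spatial lags (illustration only;
+# filter_sketch is a hypothetical name, not the spreg helper):
+def filter_sketch(lam, bigZ, bigZlag):
+    # z* = z - lambda_r * W z, equation by equation
+    return {r: bigZ[r] - lam[r][0] * bigZlag[r] for r in range(len(bigZ))}
+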
+
+def _momentsGM_sur_Error(w, u):
+    n = w.shape[0]
+    u2 = (u * u).sum(0)
+    wu = w * u
+    uwu = (u * wu).sum(0)
+    wu2 = (wu * wu).sum(0)
+    wwu = w * wu
+    uwwu = (u * wwu).sum(0)
+    wwu2 = (wwu * wwu).sum(0)
+    wwuwu = (wwu * wu).sum(0)
+    trWtW = w.multiply(w).sum()
+    moments = {}
+    for r in range(u.shape[1]):
+        g = np.array([[u2[r], wu2[r], uwu[r]]]).T / n
+        G = (
+            np.array(
+                [
+                    [2 * uwu[r], -wu2[r], n],
+                    [2 * wwuwu[r], -wwu2[r], trWtW],
+                    [uwwu[r] + wu2[r], -wwuwu[r], 0.0],
+                ]
+            )
+            / n
+        )
+        moments[r] = [G, g]
+    return moments
+
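+# Each moments[r] pairs the system matrix G with the sample moments g of the
+# Kelejian-Prucha conditions for equation r; optim_moments (from spreg.utils)
+# solves them. A rough stand-in (a sketch only; gm_lambda_sketch is a
+# hypothetical name) would minimize || g - G [lambda, lambda^2, sigma^2]' ||^2:
+def gm_lambda_sketch(G, g):
+    def crit(pars):
+        lam, sig2 = pars
+        v = np.array([[lam], [lam ** 2], [sig2]])
+        resid = g - G @ v
+        return float(resid.T @ resid)
+
+    res = minimize(crit, x0=[0.0, 1.0], bounds=[(-0.99, 0.99), (1e-8, None)])
+    return res.x[0]                                    # estimated spatial coefficient
+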
+
+
+[docs]
+class SURerrorGM(BaseSURerrorGM, REGI.Regimes_Frame):
+"""
+ User class for SUR Error estimation by Generalized Moments
+
+ Parameters
+ ----------
+ bigy : list or dictionary
+ list with the name of the dependent variable for each equation
+ or dictionary with vectors for dependent variable by equation
+ bigX : list or dictionary
+ list of lists the name of the explanatory variables for each equation
+ or dictionary with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ w : spatial weights object
+    df           : Pandas DataFrame
+                   Optional. Required in case bigy and bigX are lists with names of variables
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ nonspat_diag : boolean
+ flag for non-spatial diagnostics, default = False
+ spat_diag : boolean
+ flag for spatial diagnostics, default = False (to be implemented)
+ vm : boolean
+ flag for asymptotic variance for lambda and Sigma,
+ default = False (to be implemented)
+ name_bigy : dictionary
+ with name of dependent variable for each equation.
+ default = None, but should be specified is done when
+ sur_stackxy is used
+ name_bigX : dictionary
+ with names of explanatory variables for each equation.
+ default = None, but should be specified is done when
+ sur_stackxy is used
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the weights file
+ name_regimes : string
+ name of regime variable for use in the output
+
+ Attributes
+ ----------
+ n : int
+ number of observations in each cross-section
+ n_eq : int
+ number of equations
+ bigy : dictionary
+ with vectors of dependent variable, one for
+ each equation
+ bigX : dictionary
+ with matrices of explanatory variables,
+ one for each equation
+ bigK : array
+ n_eq x 1 array with number of explanatory variables
+ by equation
+ bigylag : dictionary
+ spatially lagged dependent variable
+ bigXlag : dictionary
+ spatially lagged explanatory variable
+ lamsur : float
+ spatial autoregressive coefficient in ML SUR Error
+ bSUR : array
+ beta coefficients in ML SUR Error
+ varb : array
+ variance of beta coefficients in ML SUR Error
+ sig : array
+ error variance-covariance matrix in ML SUR Error
+ bigE : array
+ n by n_eq matrix of vectors of residuals for each equation
+ sur_inf : array
+ inference for regression coefficients, stand. error, t, p
+ surchow : array
+ list with tuples for Chow test on regression coefficients.
+ each tuple contains test value, degrees of freedom, p-value
+ name_bigy : dictionary
+ with name of dependent variable for each equation
+ name_bigX : dictionary
+ with names of explanatory variables for each
+ equation
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the weights file
+ name_regimes : string
+ name of regime variable for use in the output
+
+
+ Examples
+ --------
+
+ >>> import libpysal
+ >>> import geopandas as gpd
+ >>> from spreg import SURerrorGM
+
+ Open data on NCOVR US County Homicides (3085 areas) from libpysal examples using geopandas.
+
+ >>> nat = libpysal.examples.load_example('Natregimes')
+ >>> df = gpd.read_file(nat.get_path("natregimes.shp"))
+
+ The specification of the model to be estimated can be provided as lists.
+ Each equation should be listed separately. In this example, equation 1
+ has HR80 as dependent variable and PS80 and UE80 as exogenous regressors.
+ For equation 2, HR90 is the dependent variable, and PS90 and UE90 the
+ exogenous regressors.
+
+ >>> y_var = ['HR80','HR90']
+ >>> x_var = [['PS80','UE80'],['PS90','UE90']]
+
+ To run a spatial error model, we need to specify the spatial weights matrix.
+ To do that, we can open an already existing gal file or create a new one.
+ In this example, we will create a new one from NAT.shp and transform it to
+ row-standardized.
+
+ >>> w = libpysal.weights.Queen.from_dataframe(df)
+ >>> w.transform='r'
+
+ We can now run the regression and then have a summary of the output by typing:
+ print(reg.summary)
+
+ Alternatively, we can just check the betas and standard errors, asymptotic t
+ and p-value of the parameters:
+
+ >>> reg = SURerrorGM(y_var,x_var,w=w,df=df,name_ds="NAT",name_w="nat_queen")
+ >>> reg.bSUR
+ {0: array([[3.97746866],
+ [0.89021219],
+ [0.43050363]]), 1: array([[2.93679119],
+ [1.11002826],
+ [0.48761542]])}
+ >>> reg.sur_inf
+ {0: array([[ 0.37251476, 10.67734504, 0. ],
+ [ 0.14224297, 6.25839153, 0. ],
+ [ 0.04322388, 9.95985608, 0. ]]), 1: array([[ 0.33694902, 8.71583245, 0. ],
+ [ 0.13413626, 8.27537783, 0. ],
+ [ 0.04033105, 12.09032288, 0. ]])}
+ """
+
+
+[docs]
+    def __init__(
+ self,
+ bigy,
+ bigX,
+ w,
+ df=None,
+ regimes=None,
+ nonspat_diag=True,
+ spat_diag=False,
+ vm=False,
+ name_bigy=None,
+ name_bigX=None,
+ name_ds=None,
+ name_w=None,
+ name_regimes=None,
+ ):
+
+        if isinstance(bigy, list) or isinstance(bigX, list):
+            if isinstance(bigy, list) and isinstance(bigX, list):
+                if len(bigy) == len(bigX):
+                    if df is not None:
+                        bigy, bigX, name_bigy, name_bigX = sur_dictxy(df, bigy, bigX)
+                    else:
+                        raise Exception("Error: df argument is required if bigy and bigX are lists")
+                else:
+                    raise Exception("Error: bigy and bigX must have the same number of elements")
+            else:
+                raise Exception("Error: bigy and bigX must be both lists or both dictionaries")
+
+ # check on variable names for listing results
+ self.name_ds=USER.set_name_ds(name_ds)
+ self.name_w=USER.set_name_w(name_w,w)
+ # initialize names - should be generated by sur_stack
+ self.n_eq=len(bigy.keys())
+        if name_bigy:
+            self.name_bigy = name_bigy
+        else:  # need to construct y names
+            self.name_bigy = {}
+            for r in range(self.n_eq):
+                yn = "dep_var_" + str(r)
+                self.name_bigy[r] = yn
+        if name_bigX is None:
+            name_bigX = {}
+            for r in range(self.n_eq):
+                k = bigX[r].shape[1] - 1
+                name_x = ["var_" + str(i + 1) + "_" + str(r + 1) for i in range(k)]
+                ct = "Constant_" + str(r + 1)  # NOTE: constant always included in X
+                name_x.insert(0, ct)
+                name_bigX[r] = name_x
+
+        if regimes is not None:
+ self.constant_regi="many"
+ self.cols2regi="all"
+ self.regime_err_sep=False
+ self.name_regimes=USER.set_name_ds(name_regimes)
+ self.regimes_set=REGI._get_regimes_set(regimes)
+ self.regimes=regimes
+ cols2regi_dic={}
+ self.name_bigX={}
+ self.name_x_r=name_bigX
+
+            for r in range(self.n_eq):
+ cols2regi_dic[r]=REGI.check_cols2regi(
+ self.constant_regi,self.cols2regi,bigX[r],add_cons=False
+ )
+ USER.check_regimes(self.regimes_set,bigy[0].shape[0],bigX[r].shape[1])
+ bigX[r],self.name_bigX[r]=REGI.Regimes_Frame.__init__(
+ self,
+ bigX[r],
+ regimes,
+ constant_regi=None,
+ cols2regi=cols2regi_dic[r],
+ names=name_bigX[r],
+ )
+ else:
+ self.name_bigX=name_bigX
+
+ BaseSURerrorGM.__init__(self,bigy=bigy,bigX=bigX,w=w)
+
+ # inference
+ self.sur_inf=sur_setp(self.bSUR,self.varb)
+
+ # test on constancy of regression coefficients across equations
+        if check_k(self.bigK):  # only for equal number of variables
+ self.surchow=sur_chow(self.n_eq,self.bigK,self.bSUR,self.varb)
+ else:
+ self.surchow=None
+
+ # listing of results
+ self.title="SEEMINGLY UNRELATED REGRESSIONS (SUR) - GM SPATIAL ERROR MODEL"
+
+        if regimes is not None:
+ self.title="SUR - GM SPATIAL ERROR MODEL WITH REGIMES"
+ self.chow_regimes={}
+ varb_counter=0
+            for r in range(self.n_eq):
+ counter_end=varb_counter+self.bSUR[r].shape[0]
+ self.chow_regimes[r]=REGI._chow_run(
+ len(cols2regi_dic[r]),
+ 0,
+ 0,
+ len(self.regimes_set),
+ self.bSUR[r],
+ self.varb[varb_counter:counter_end,varb_counter:counter_end],
+ )
+ varb_counter=counter_end
+ regimes=True
+
+ SUMMARY.SUR(
+ reg=self,
+ nonspat_diag=nonspat_diag,
+ spat_diag=spat_diag,
+ lambd=True,
+ ml=False,
+ regimes=regimes,
+ )
+
+
+
+
+class BaseSURerrorML:
+"""
+ Base class for SUR Error estimation by Maximum Likelihood
+
+ requires: scipy.optimize.minimize_scalar and scipy.optimize.minimize
+
+ Parameters
+ ----------
+ bigy : dictionary
+ with vectors of dependent variable, one for
+ each equation
+ bigX : dictionary
+ with matrices of explanatory variables,
+ one for each equation
+ w : spatial weights object
+ epsilon : float
+ convergence criterion for ML iterations
+ default 0.0000001
+
+ Attributes
+ ----------
+ n : int
+ number of observations in each cross-section
+ n2 : int
+ n/2
+ n_eq : int
+ number of equations
+ bigy : dictionary
+ with vectors of dependent variable, one for
+ each equation
+ bigX : dictionary
+ with matrices of explanatory variables,
+ one for each equation
+ bigK : array
+ n_eq x 1 array with number of explanatory variables
+ by equation
+ bigylag : dictionary
+ spatially lagged dependent variable
+ bigXlag : dictionary
+ spatially lagged explanatory variable
+ lamols : array
+ spatial autoregressive coefficients from equation by
+ equation ML-Error estimation
+ clikerr : float
+ concentrated log-likelihood from equation by equation
+ ML-Error estimation (no constant)
+ bSUR0 : array
+ SUR estimation for betas without spatial autocorrelation
+ llik : float
+ log-likelihood for classic SUR estimation (includes constant)
+ lamsur : float
+ spatial autoregressive coefficient in ML SUR Error
+ bSUR : array
+ beta coefficients in ML SUR Error
+ varb : array
+ variance of beta coefficients in ML SUR Error
+ sig : array
+ error variance-covariance matrix in ML SUR Error
+ corr : array
+ error correlation matrix
+ bigE : array
+ n by n_eq matrix of vectors of residuals for each equation
+ cliksurerr : float
+ concentrated log-likelihood from ML SUR Error (no constant)
+
+ """
+
+    def __init__(self, bigy, bigX, w, epsilon=0.0000001):
+ # setting up constants
+ self.n=w.n
+ self.n2=self.n/2.0
+ self.n_eq=len(bigy.keys())
+ WS=w.sparse
+ I=sp.identity(self.n)
+ # variables
+ self.bigy=bigy
+ self.bigX=bigX
+ # number of variables by equation
+        self.bigK = np.zeros((self.n_eq, 1), dtype=np.int_)
+        for r in range(self.n_eq):
+            self.bigK[r] = self.bigX[r].shape[1]
+        # spatially lagged variables
+        self.bigylag = {}
+        for r in range(self.n_eq):
+            self.bigylag[r] = WS * self.bigy[r]
+        # note: unlike WX as instruments, this includes the constant
+        self.bigXlag = {}
+        for r in range(self.n_eq):
+            self.bigXlag[r] = WS * self.bigX[r]
+
+ # spatial parameter starting values
+        lam = np.zeros((self.n_eq, 1))  # initialize as an array
+        fun0 = 0.0
+        fun1 = 0.0
+        for r in range(self.n_eq):
+ res=minimize_scalar(
+ err_c_loglik_sp,
+ 0.0,
+ bounds=(-1.0,1.0),
+ args=(
+ self.n,
+ self.bigy[r],
+ self.bigylag[r],
+ self.bigX[r],
+ self.bigXlag[r],
+ I,
+ WS,
+ ),
+ method="bounded",
+ options={"xatol":epsilon},
+ )
+ lam[r]=res.x
+ fun1+=res.fun
+        self.lamols = lam
+        self.clikerr = -fun1  # negative because use in min
+
+ # SUR starting values
+        reg0 = BaseSUR(self.bigy, self.bigX, iter=True)
+        bigE = reg0.bigE
+        self.bSUR0 = reg0.bSUR
+        self.llik = reg0.llik  # as is, includes constant
+
+ # iteration
+        lambdabounds = [(-1.0, +1.0) for i in range(self.n_eq)]
+        while abs(fun0 - fun1) > epsilon:
+            fun0 = fun1
+            sply = filter_dict(lam, self.bigy, self.bigylag)
+            splX = filter_dict(lam, self.bigX, self.bigXlag)
+            WbigE = WS * bigE
+            splbigE = bigE - WbigE * lam.T
+            splXX, splXy = sur_crossprod(splX, sply)
+            b1, varb1, sig1 = sur_est(splXX, splXy, splbigE, self.bigK)
+            bigE = sur_resids(self.bigy, self.bigX, b1)
+ res=minimize(
+ clik,
+ np.array(lam).flatten(),
+ args=(self.n,self.n2,self.n_eq,bigE,I,WS),
+ method="L-BFGS-B",
+ bounds=lambdabounds,
+ )
+ lam=res.x
+ lam.resize((self.n_eq,1))
+ fun1=res.fun
+        self.lamsur = lam
+        self.bSUR = b1
+        self.varb = varb1
+        self.sig = sig1
+        self.corr = sur_corr(self.sig)
+        self.bigE = bigE
+        self.cliksurerr = -fun1  # negative because use in min, no constant
+
+
+
+[docs]
+class SURerrorML(BaseSURerrorML, REGI.Regimes_Frame):
+"""
+ User class for SUR Error estimation by Maximum Likelihood
+
+ Parameters
+ ----------
+ bigy : list or dictionary
+ list with the name of the dependent variable for each equation
+ or dictionary with vectors for dependent variable by equation
+ bigX : list or dictionary
+ list of lists the name of the explanatory variables for each equation
+ or dictionary with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ w : spatial weights object
+    df           : Pandas DataFrame
+                   Optional. Required in case bigy and bigX are lists with names of variables
+ regimes : list
+ default = None.
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ epsilon : float
+ convergence criterion for ML iterations.
+ default 0.0000001
+ nonspat_diag : boolean
+ flag for non-spatial diagnostics, default = True
+ spat_diag : boolean
+ flag for spatial diagnostics, default = False
+ vm : boolean
+ flag for asymptotic variance for lambda and Sigma,
+ default = False
+ name_bigy : dictionary
+ with name of dependent variable for each equation.
+ default = None, but should be specified is done when
+ sur_stackxy is used
+ name_bigX : dictionary
+ with names of explanatory variables for each equation.
+ default = None, but should be specified is done when
+ sur_stackxy is used
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the weights file
+ name_regimes : string
+ name of regime variable for use in the output
+
+ Attributes
+ ----------
+ n : int
+ number of observations in each cross-section
+ n2 : int
+ n/2
+ n_eq : int
+ number of equations
+ bigy : dictionary
+ with vectors of dependent variable, one for
+ each equation
+ bigX : dictionary
+ with matrices of explanatory variables,
+ one for each equation
+ bigK : array
+ n_eq x 1 array with number of explanatory variables
+ by equation
+ bigylag : dictionary
+ spatially lagged dependent variable
+ bigXlag : dictionary
+ spatially lagged explanatory variable
+ lamols : array
+ spatial autoregressive coefficients from equation by
+ equation ML-Error estimation
+ clikerr : float
+ concentrated log-likelihood from equation by equation
+ ML-Error estimation (no constant)
+ bSUR0 : array
+ SUR estimation for betas without spatial autocorrelation
+ llik : float
+ log-likelihood for classic SUR estimation (includes constant)
+ lamsur : float
+ spatial autoregressive coefficient in ML SUR Error
+ bSUR : array
+ beta coefficients in ML SUR Error
+ varb : array
+ variance of beta coefficients in ML SUR Error
+ sig : array
+ error variance-covariance matrix in ML SUR Error
+ bigE : array
+ n by n_eq matrix of vectors of residuals for each equation
+ cliksurerr : float
+ concentrated log-likelihood from ML SUR Error (no constant)
+ sur_inf : array
+ inference for regression coefficients, stand. error, t, p
+ errllik : float
+ log-likelihood for error model without SUR (with constant)
+ surerrllik : float
+ log-likelihood for SUR error model (with constant)
+ lrtest : tuple
+ likelihood ratio test for off-diagonal Sigma elements
+ likrlambda : tuple
+ likelihood ratio test on spatial autoregressive coefficients
+ vm : array
+ asymptotic variance matrix for lambda and Sigma (only for vm=True)
+ lamsetp : array
+ inference for lambda, stand. error, t, p (only for vm=True)
+ lamtest : tuple
+ with test for constancy of lambda across equations
+ (test value, degrees of freedom, p-value)
+ joinlam : tuple
+ with test for joint significance of lambda across
+ equations (test value, degrees of freedom, p-value)
+ surchow : list
+ with tuples for Chow test on regression coefficients.
+ each tuple contains test value, degrees of freedom, p-value
+ name_bigy : dictionary
+ with name of dependent variable for each equation
+ name_bigX : dictionary
+ with names of explanatory variables for each
+ equation
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the weights file
+ name_regimes : string
+ name of regime variable for use in the output
+
+
+ Examples
+ --------
+
+ >>> import libpysal
+ >>> import geopandas as gpd
+ >>> from spreg import SURerrorML
+
+ Open data on NCOVR US County Homicides (3085 areas) from libpysal examples using geopandas.
+
+ >>> nat = libpysal.examples.load_example('Natregimes')
+ >>> df = gpd.read_file(nat.get_path("natregimes.shp"))
+
+ The specification of the model to be estimated can be provided as lists.
+ Each equation should be listed separately. In this example, equation 1
+ has HR80 as dependent variable and PS80 and UE80 as exogenous regressors.
+ For equation 2, HR90 is the dependent variable, and PS90 and UE90 the
+ exogenous regressors.
+
+ >>> y_var = ['HR80','HR90']
+ >>> x_var = [['PS80','UE80'],['PS90','UE90']]
+
+ To run a spatial error model, we need to specify the spatial weights matrix.
+ To do that, we can open an already existing gal file or create a new one.
+ In this example, we will create a new one from NAT.shp and transform it to
+ row-standardized.
+
+ >>> w = libpysal.weights.Queen.from_dataframe(df)
+ >>> w.transform='r'
+
+ We can now run the regression and then have a summary of the output by typing:
+ print(reg.summary)
+
+ Alternatively, we can just check the betas and standard errors, asymptotic t
+ and p-value of the parameters:
+
+    >>> reg = SURerrorML(y_var,x_var,w=w,df=df,name_ds="NAT",name_w="nat_queen")
+ >>> reg.bSUR
+ {0: array([[4.02228606],
+ [0.88489637],
+ [0.42402845]]), 1: array([[3.04923031],
+ [1.10972632],
+ [0.47075678]])}
+
+ >>> reg.sur_inf
+ {0: array([[ 0.36692175, 10.96224484, 0. ],
+ [ 0.14129077, 6.26294545, 0. ],
+ [ 0.04267954, 9.93516909, 0. ]]), 1: array([[ 0.33139967, 9.20106629, 0. ],
+ [ 0.13352591, 8.31094381, 0. ],
+ [ 0.04004097, 11.75687747, 0. ]])}
+
+ """
+
+
+[docs]
+    def __init__(
+ self,
+ bigy,
+ bigX,
+ w,
+ df=None,
+ regimes=None,
+ nonspat_diag=True,
+ spat_diag=False,
+ vm=False,
+ epsilon=0.0000001,
+ name_bigy=None,
+ name_bigX=None,
+ name_ds=None,
+ name_w=None,
+ name_regimes=None,
+ ):
+
+        if isinstance(bigy, list) or isinstance(bigX, list):
+            if isinstance(bigy, list) and isinstance(bigX, list):
+                if len(bigy) == len(bigX):
+                    if df is not None:
+                        bigy, bigX, name_bigy, name_bigX = sur_dictxy(df, bigy, bigX)
+                    else:
+                        raise Exception("Error: df argument is required if bigy and bigX are lists")
+                else:
+                    raise Exception("Error: bigy and bigX must have the same number of elements")
+            else:
+                raise Exception("Error: bigy and bigX must be both lists or both dictionaries")
+
+ # check on variable names for listing results
+ self.name_ds=USER.set_name_ds(name_ds)
+ self.name_w=USER.set_name_w(name_w,w)
+ self.n_eq=len(bigy.keys())
+ # initialize names - should be generated by sur_stack
+        if name_bigy:
+            self.name_bigy = name_bigy
+        else:  # need to construct y names
+            self.name_bigy = {}
+            for r in range(self.n_eq):
+                yn = "dep_var_" + str(r)
+                self.name_bigy[r] = yn
+        if name_bigX is None:
+            name_bigX = {}
+            for r in range(self.n_eq):
+                k = bigX[r].shape[1] - 1
+                name_x = ["var_" + str(i + 1) + "_" + str(r + 1) for i in range(k)]
+                ct = "Constant_" + str(r + 1)  # NOTE: constant always included in X
+                name_x.insert(0, ct)
+                name_bigX[r] = name_x
+
+        if regimes is not None:
+ self.constant_regi="many"
+ self.cols2regi="all"
+ self.regime_err_sep=False
+ self.name_regimes=USER.set_name_ds(name_regimes)
+ self.regimes_set=REGI._get_regimes_set(regimes)
+ self.regimes=regimes
+ self.name_x_r=name_bigX
+ cols2regi_dic={}
+ self.name_bigX={}
+            for r in range(self.n_eq):
+ cols2regi_dic[r]=REGI.check_cols2regi(
+ self.constant_regi,self.cols2regi,bigX[r],add_cons=False
+ )
+ USER.check_regimes(self.regimes_set,bigy[0].shape[0],bigX[r].shape[1])
+ bigX[r],self.name_bigX[r]=REGI.Regimes_Frame.__init__(
+ self,
+ bigX[r],
+ regimes,
+ constant_regi=None,
+ cols2regi=cols2regi_dic[r],
+ names=name_bigX[r],
+ )
+ else:
+ self.name_bigX=name_bigX
+
+ # moved init here
+ BaseSURerrorML.__init__(self,bigy=bigy,bigX=bigX,w=w,epsilon=epsilon)
+
+ # inference
+ self.sur_inf=sur_setp(self.bSUR,self.varb)
+
+ # adjust concentrated log lik for constant
+ const=-self.n2*(self.n_eq*(1.0+np.log(2.0*np.pi)))
+ self.errllik=const+self.clikerr
+ self.surerrllik=const+self.cliksurerr
+
+ # LR test on off-diagonal sigma
+        if nonspat_diag:
+ M=self.n_eq*(self.n_eq-1)/2.0
+ likrodiag=2.0*(self.surerrllik-self.errllik)
+ plik1=stats.chisqprob(likrodiag,M)
+ self.lrtest=(likrodiag,int(M),plik1)
+ else:
+ self.lrtest=None
+
+ # LR test on spatial autoregressive coefficients
+        if spat_diag:
+ liklambda=2.0*(self.surerrllik-self.llik)
+ plik2=stats.chisqprob(liklambda,self.n_eq)
+ self.likrlambda=(liklambda,self.n_eq,plik2)
+ else:
+ self.likrlambda=None
+
+ # asymptotic variance for spatial coefficient
+        if vm:
+ self.vm=surerrvm(self.n,self.n_eq,w,self.lamsur,self.sig)
+ vlam=self.vm[:self.n_eq,:self.n_eq]
+ self.lamsetp=lam_setp(self.lamsur,vlam)
+ # test on constancy of lambdas
+ R=REGI.buildR(kr=1,kf=0,nr=self.n_eq)
+ w,p=REGI.wald_test(self.lamsur,R,np.zeros((R.shape[0],1)),vlam)
+ self.lamtest=(w,R.shape[0],p)
+            if spat_diag:  # test on joint significance of lambdas
+ Rj=np.identity(self.n_eq)
+ wj,pj=REGI.wald_test(
+ self.lamsur,Rj,np.zeros((Rj.shape[0],1)),vlam
+ )
+ self.joinlam=(wj,Rj.shape[0],pj)
+ else:
+ self.joinlam=None
+ else:
+ self.vm=None
+ self.lamsetp=None
+ self.lamtest=None
+ self.joinlam=None
+
+ # test on constancy of regression coefficients across equations
+        if check_k(self.bigK):  # only for equal number of variables
+ self.surchow=sur_chow(self.n_eq,self.bigK,self.bSUR,self.varb)
+ else:
+ self.surchow=None
+
+ # listing of results
+ self.title="SEEMINGLY UNRELATED REGRESSIONS (SUR) - ML SPATIAL ERROR MODEL"
+        if regimes is not None:
+ self.title="SUR - ML SPATIAL ERROR MODEL - REGIMES"
+ self.chow_regimes={}
+ varb_counter=0
+            for r in range(self.n_eq):
+ counter_end=varb_counter+self.bSUR[r].shape[0]
+ self.chow_regimes[r]=REGI._chow_run(
+ len(cols2regi_dic[r]),
+ 0,
+ 0,
+ len(self.regimes_set),
+ self.bSUR[r],
+ self.varb[varb_counter:counter_end,varb_counter:counter_end],
+ )
+ varb_counter=counter_end
+ regimes=True
+
+ SUMMARY.SUR(
+ reg=self,
+ nonspat_diag=nonspat_diag,
+ spat_diag=spat_diag,
+ lambd=True,
+ regimes=regimes,
+ )
+
+
+
+
+def jacob(lam, n_eq, I, WS):
+"""Log-Jacobian for SUR Error model
+
+ Parameters
+ ----------
+ lam : array
+ n_eq by 1 array of spatial autoregressive parameters
+ n_eq : int
+ number of equations
+ I : sparse matrix
+ sparse Identity matrix
+ WS : sparse matrix
+ sparse spatial weights matrix
+
+ Returns
+ -------
+ logjac : float
+ the log Jacobian
+
+ """
+    logjac = 0.0
+    for r in range(n_eq):
+        lami = lam[r]
+        lamWS = WS.multiply(lami)
+        B = (I - lamWS).tocsc()
+        LU = SuperLU(B)
+        jj = np.sum(np.log(np.abs(LU.U.diagonal())))
+        logjac += jj
+    return logjac
+
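+# For a small weights matrix, the same quantity can be cross-checked against
+# the eigenvalue form log|I - lambda_r W| = sum_i log(1 - lambda_r w_i)
+# (a validation sketch only; eig_logjac is a hypothetical name, and the dense
+# eigen-decomposition is impractical for large n):
+def eig_logjac(lam, n_eq, WS):
+    ev = np.linalg.eigvals(np.asarray(WS.todense())).astype(complex)
+    total = 0.0
+    for r in range(n_eq):
+        # complex logs are summed and the real part taken, since W need not be symmetric
+        total += np.sum(np.log(1.0 - lam[r] * ev)).real
+    return float(total)
+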
+
+def clik(lam, n, n2, n_eq, bigE, I, WS):
+"""
+ Concentrated (negative) log-likelihood for SUR Error model
+
+ Parameters
+ ----------
+ lam : array
+ n_eq x 1 array of spatial autoregressive parameters
+ n : int
+ number of observations in each cross-section
+ n2 : int
+ n/2
+ n_eq : int
+ number of equations
+ bigE : array
+ n by n_eq matrix with vectors of residuals for
+ each equation
+ I : sparse Identity matrix
+ WS : sparse spatial weights matrix
+
+ Returns
+ -------
+ -clik : float
+ negative (for minimize) of the concentrated
+ log-likelihood function
+
+ """
+    WbigE = WS * bigE
+    spfbigE = bigE - WbigE * lam.T
+    sig = np.dot(spfbigE.T, spfbigE) / n
+    ldet = la.slogdet(sig)[1]
+    logjac = jacob(lam, n_eq, I, WS)
+    clik = -n2 * ldet + logjac
+    return -clik  # negative for minimize
+
+
+def surerrvm(n, n_eq, w, lam, sig):
+"""
+ Asymptotic variance matrix for lambda and Sigma in
+ ML SUR Error estimation
+
+ Source: Anselin (1988) :cite:`Anselin1988`, Chapter 10.
+
+ Parameters
+ ----------
+ n : int
+ number of cross-sectional observations
+ n_eq : int
+ number of equations
+ w : spatial weights object
+ lam : array
+ n_eq by 1 vector with spatial autoregressive coefficients
+ sig : array
+ n_eq by n_eq matrix with cross-equation error covariances
+
+ Returns
+ -------
+ vm : array
+ asymptotic variance-covariance matrix for spatial autoregressive
+ coefficients and the upper triangular elements of Sigma
+ n_eq + n_eq x (n_eq + 1) / 2 coefficients
+
+
+ """
+ # inverse Sigma
+ sigi=la.inv(sig)
+ sisi=sigi*sig
+ # elements of Psi_lam,lam
+ # trace terms
+ trDi=np.zeros((n_eq,1))
+ trDDi=np.zeros((n_eq,1))
+ trDTDi=np.zeros((n_eq,1))
+ trDTiDj=np.zeros((n_eq,n_eq))
+ WS=w.sparse
+ I=sp.identity(n)
+    for i in range(n_eq):
+ lami=lam[i][0]
+ lamWS=WS.multiply(lami)
+ B=I-lamWS
+ bb=B.todense()
+ Bi=la.inv(bb)
+ D=WS*Bi
+ trDi[i]=np.trace(D)
+ DD=np.dot(D,D)
+ trDDi[i]=np.trace(DD)
+ DD=np.dot(D.T,D)
+ trDTDi[i]=np.trace(DD)
+        for j in range(i + 1, n_eq):
+ lamj=lam[j][0]
+ lamWS=WS.multiply(lamj)
+ B=I-lamWS
+ bb=B.todense()
+ Bi=la.inv(bb)
+ Dj=WS*Bi
+ DD=np.dot(D.T,Dj)
+ trDTiDj[i,j]=np.trace(DD)
+ trDTiDj[j,i]=trDTiDj[i,j]
+ np.fill_diagonal(trDTiDj,trDTDi)
+
+ sisjT=sisi*trDTiDj
+ Vll=np.diagflat(trDDi)+sisjT
+
+ # elements of Psi_lam_sig
+    P = int(n_eq * (n_eq + 1) / 2)  # force ints to be ints
+    tlist = [(i, j) for i in range(n_eq) for j in range(i, n_eq)]
+ zog=sigi*trDi
+ Vlsig=np.zeros((n_eq,P))
+    for i in range(n_eq):
+        for j in range(n_eq):
+            if i > j:
+                jj = tlist.index((j, i))
+            else:
+                jj = tlist.index((i, j))
+            Vlsig[i, jj] = zog[i, j]
+
+ # top of Psi
+ vtop=np.hstack((Vll,Vlsig))
+
+ # elements of Psi_sig_sig
+
+ Vsig=np.zeros((P,P))
+    for ij in range(P):
+        i, j = tlist[ij]
+        for hk in range(P):
+            h, k = tlist[hk]
+            if i == j:
+                if h == k:
+                    Vsig[ij, hk] = 0.5 * (sigi[i, h] ** 2)
+                else:  # h not equal to k
+                    Vsig[ij, hk] = sigi[i, h] * sigi[i, k]
+            else:  # i not equal to j
+                if h == k:
+                    Vsig[ij, hk] = sigi[i, h] * sigi[j, h]
+                else:  # h not equal to k
+                    Vsig[ij, hk] = sigi[i, h] * sigi[j, k] + sigi[i, k] * sigi[j, h]
+ Vsig=n*Vsig
+
+ # bottom of Psi
+ vbottom=np.hstack((Vlsig.T,Vsig))
+
+ # all of Psi
+ vbig=np.vstack((vtop,vbottom))
+
+ # inverse of Psi
+ vm=la.inv(vbig)
+
+    return vm
+
+
+def _test():
+    import doctest
+
+    start_suppress = np.get_printoptions()["suppress"]
+    np.set_printoptions(suppress=True)
+    doctest.testmod()
+    np.set_printoptions(suppress=start_suppress)
+
+
+if __name__ == "__main__":
+    _test()
+    import numpy as np
+    import libpysal
+    from .sur_utils import sur_dictxy, sur_dictZ
+    from libpysal.examples import load_example
+    from libpysal.weights import Queen
+
+    nat = load_example("Natregimes")
+    db = libpysal.io.open(nat.get_path("natregimes.dbf"), "r")
+    y_var = ["HR80", "HR90"]
+    x_var = [["PS80", "UE80"], ["PS90", "UE90"]]
+    w = Queen.from_shapefile(nat.get_path("natregimes.shp"))
+    w.transform = "r"
+    bigy0, bigX0, bigyvars0, bigXvars0 = sur_dictxy(db, y_var, x_var)
+ reg0=SURerrorML(
+ bigy0,
+ bigX0,
+ w,
+ #regimes=regimes,
+ name_bigy=bigyvars0,
+ name_bigX=bigXvars0,
+ name_w="natqueen",
+ name_ds="natregimes",
+ vm=True,
+ nonspat_diag=True,
+ spat_diag=True,
+ )
+
+ # reg0 = SURerrorGM(bigy0,bigX0,w,regimes=regimes,name_bigy=bigyvars0,name_bigX=bigXvars0,\
+ # name_w="natqueen",name_ds="natregimes",vm=False,nonspat_diag=True,spat_diag=False)
+
+ print(reg0.summary)
+
+"""
+Spatial Lag SUR estimation
+"""
+
+__author__="Luc Anselin lanselin@gmail.com, \
+ Pedro V. Amaral pedrovma@gmail.com"
+
+
+import numpy as np
+from . import summary_output as SUMMARY
+from . import user_output as USER
+from . import regimes as REGI
+from .sur import BaseThreeSLS
+from .diagnostics_sur import sur_setp, sur_chow, sur_joinrho
+from .sur_utils import check_k, sur_dictxy, sur_dictZ
+
+__all__=["SURlagIV"]
+
+
+
+[docs]
+class SURlagIV(BaseThreeSLS, REGI.Regimes_Frame):
+"""
+ User class for spatial lag estimation using IV
+
+ Parameters
+ ----------
+ bigy : list or dictionary
+ list with the names of the dependent variable for each equation
+ or dictionary with vectors for dependent variable by equation
+ bigX : list or dictionary
+ list of lists with the names of the explanatory variables for each equation
+ or dictionary with matrix of explanatory variables by equation
+ (note, already includes constant term)
+ bigyend : list or dictionary
+ list of lists with the names of the endogenous variables for each equation
+ or dictionary with matrix of endogenous variables by equation
+ bigq : list or dictionary
+ list of lists with the names of the instrument variables for each equation
+ or dictionary with matrix of instruments by equation
+ w : spatial weights object, required
+ df : Pandas DataFrame
+ Optional. Required in case bigy and bigX (and bigyend and bigq, if provided) are lists with names of variables
+ vm : boolean
+ listing of full variance-covariance matrix, default = False
+ w_lags : integer
+ order of spatial lags for WX instruments, default = 1
+ lag_q : boolean
+ flag to apply spatial lag to other instruments,
+ default = True
+ nonspat_diag : boolean
+ flag for non-spatial diagnostics, default = True
+ spat_diag : boolean
+ flag for spatial diagnostics, default = False
+ name_bigy : dictionary
+ with name of dependent variable for each equation.
+ default = None, but should be specified;
+ filled in automatically when sur_stackxy is used.
+ name_bigX : dictionary
+ with names of explanatory variables for each
+ equation.
+ default = None, but should be specified;
+ filled in automatically when sur_stackxy is used.
+ name_bigyend : dictionary
+ with names of endogenous variables for each
+ equation.
+ default = None, but should be specified;
+ filled in automatically when sur_stackZ is used.
+ name_bigq : dictionary
+ with names of instrumental variables for each
+ equation.
+ default = None, but should be specified;
+ filled in automatically when sur_stackZ is used.
+ name_ds : string
+ name for the data set
+ name_w : string
+ name for the spatial weights
+
+ Attributes
+ ----------
+ w : spatial weights object
+ bigy : dictionary
+ with y values
+ bigZ : dictionary
+ with matrix of exogenous and endogenous variables
+ for each equation
+ bigyend : dictionary
+ with matrix of endogenous variables for each
+ equation; contains Wy only if no other endogenous specified
+ bigq : dictionary
+ with matrix of instrumental variables for each
+ equation; contains WX only if no other endogenous specified
+ bigZHZH : dictionary
+ with matrix of cross products Zhat_r'Zhat_s
+ bigZHy : dictionary
+ with matrix of cross products Zhat_r'y_end_s
+ n_eq : int
+ number of equations
+ n : int
+ number of observations in each cross-section
+ bigK : array
+ vector with number of explanatory variables (including constant,
+ exogenous and endogenous) for each equation
+ b2SLS : dictionary
+ with 2SLS regression coefficients for each equation
+ tslsE : array
+ n by n_eq array with 2SLS residuals for each equation
+ b3SLS : dictionary
+ with 3SLS regression coefficients for each equation
+ varb : array
+ variance-covariance matrix
+ sig : array
+ Sigma matrix of inter-equation error covariances
+ resids : array
+ n by n_eq array of residuals
+ corr : array
+ inter-equation 3SLS error correlation matrix
+ tsls_inf : dictionary
+ with standard error, asymptotic t and p-value,
+ one for each equation
+ joinrho : tuple
+ test on joint significance of spatial autoregressive coefficient.
+ tuple with test statistic, degrees of freedom, p-value
+ surchow : array
+ list with tuples for Chow test on regression coefficients
+ each tuple contains test value, degrees of freedom, p-value
+ name_w : string
+ name for the spatial weights
+ name_ds : string
+ name for the data set
+ name_bigy : dictionary
+ with name of dependent variable for each equation
+ name_bigX : dictionary
+ with names of explanatory variables for each
+ equation
+ name_bigyend : dictionary
+ with names of endogenous variables for each
+ equation
+ name_bigq : dictionary
+ with names of instrumental variables for each
+ equations
+
+
+ Examples
+ --------
+ >>> import libpysal
+ >>> import geopandas as gpd
+ >>> from spreg import SURlagIV
+ >>> import numpy as np
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+
+ Open data on NCOVR US County Homicides (3085 areas) from libpysal examples using geopandas.
+
+ >>> nat = libpysal.examples.load_example('Natregimes')
+ >>> df = gpd.read_file(nat.get_path("natregimes.shp"))
+
+ The specification of the model to be estimated can be provided as lists.
+ Each equation should be listed separately. In this example, equation 1
+ has HR80 as dependent variable, PS80 and UE80 as exogenous regressors,
+ RD80 as endogenous regressor and FP79 as additional instrument.
+ For equation 2, HR90 is the dependent variable, PS90 and UE90 the
+ exogenous regressors, RD90 the endogenous regressor and FP89 the
+ additional instrument.
+
+ >>> y_var = ['HR80','HR90']
+ >>> x_var = [['PS80','UE80'],['PS90','UE90']]
+ >>> yend_var = [['RD80'],['RD90']]
+ >>> q_var = [['FP79'],['FP89']]
+
+ To run a spatial lag model, we need to specify the spatial weights matrix.
+ To do that, we can open an already existing gal file or create a new one.
+ In this example, we will create a new one from NAT.shp and transform it to
+ row-standardized.
+
+ >>> w = libpysal.weights.Queen.from_dataframe(df)
+ >>> w.transform='r'
+
+ We can now run the regression; a full summary of the output is obtained by typing
+ print(reg.summary).
+
+ Alternatively, we can just check the betas and the standard errors, asymptotic t
+ statistics and p-values of the parameters:
+
+ >>> reg = SURlagIV(y_var,x_var,yend_var,q_var,w=w,df=df,name_ds="NAT",name_w="nat_queen")
+ >>> reg.b3SLS
+ {0: array([[ 6.95472387],
+ [ 1.44044301],
+ [-0.00771893],
+ [ 3.65051153],
+ [ 0.00362663]]), 1: array([[ 5.61101925],
+ [ 1.38716801],
+ [-0.15512029],
+ [ 3.1884457 ],
+ [ 0.25832185]])}
+
+ >>> reg.tsls_inf
+ {0: array([[ 0.49128435, 14.15620899, 0. ],
+ [ 0.11516292, 12.50787151, 0. ],
+ [ 0.03204088, -0.2409087 , 0.80962588],
+ [ 0.1876025 , 19.45875745, 0. ],
+ [ 0.05450628, 0.06653605, 0.94695106]]), 1: array([[ 0.44969956, 12.47726211, 0. ],
+ [ 0.10440241, 13.28674277, 0. ],
+ [ 0.04150243, -3.73761961, 0.00018577],
+ [ 0.19133145, 16.66451427, 0. ],
+ [ 0.04394024, 5.87893596, 0. ]])}
+ """
+
+
+[docs]
+    def __init__(
+        self,
+        bigy,
+        bigX,
+        bigyend=None,
+        bigq=None,
+        w=None,
+        df=None,
+        regimes=None,
+        vm=False,
+        regime_lag_sep=False,
+        w_lags=1,
+        lag_q=True,
+        nonspat_diag=True,
+        spat_diag=False,
+        name_bigy=None,
+        name_bigX=None,
+        name_bigyend=None,
+        name_bigq=None,
+        name_ds=None,
+        name_w=None,
+        name_regimes=None,
+    ):
+
+        if isinstance(bigy, list) or isinstance(bigX, list) or isinstance(bigyend, list) or isinstance(bigq, list):
+            if isinstance(bigy, list) and isinstance(bigX, list) and (isinstance(bigyend, list) or bigyend is None) and (isinstance(bigq, list) or bigq is None):
+                if (len(bigy) == len(bigX) and
+                        (bigyend is None or len(bigy) == len(bigyend)) and
+                        (bigq is None or len(bigy) == len(bigq))):
+                    if df is not None:
+                        bigy, bigX, name_bigy, name_bigX = sur_dictxy(df, bigy, bigX)
+                        if bigyend is not None:
+                            bigyend, name_bigyend = sur_dictZ(df, bigyend)
+                            bigq, name_bigq = sur_dictZ(df, bigq)
+                    else:
+                        raise Exception("Error: df argument is required if bigy, bigX, bigyend (if provided) and bigq (if provided) are lists")
+                else:
+                    raise Exception("Error: bigy, bigX, bigyend (if provided) and bigq (if provided) must have the same number of elements")
+            else:
+                raise Exception("Error: bigy, bigX, bigyend (if provided) and bigq (if provided) must be all lists or all dictionaries")
+
+        self.name_ds = USER.set_name_ds(name_ds)
+        self.n_eq = len(bigy.keys())
+
+        if w is None:
+            raise Exception("Spatial weights required for SUR-Lag")
+        self.w = w
+        WS = w.sparse
+        self.name_ds = USER.set_name_ds(name_ds)
+        self.name_w = USER.set_name_w(name_w, w)
+        if bigyend and not bigq:
+            raise Exception("Instruments needed when endogenous variables")
+        # initialize
+        self.bigy = bigy
+        self.n_eq = len(self.bigy.keys())
+        if name_bigy:
+            self.name_bigy = name_bigy
+        else:  # need to construct y names
+            self.name_bigy = {}
+            for r in range(self.n_eq):
+                yn = "dep_var_" + str(r + 1)
+                self.name_bigy[r] = yn
+        # self.bigX = bigX
+        if name_bigX is None:
+            name_bigX = {}
+            for r in range(self.n_eq):
+                k = bigX[r].shape[1] - 1
+                name_x = ["var_" + str(i + 1) + "_" + str(r + 1) for i in range(k)]
+                ct = "Constant_" + str(r + 1)  # NOTE: constant always included in X
+                name_x.insert(0, ct)
+                name_bigX[r] = name_x
+        if name_bigyend is None:
+            name_bigyend = {}
+        if bigyend is not None:  # check on other endogenous
+            self.bigyend = bigyend
+            for r in range(self.n_eq):
+                ky = bigyend[r].shape[1]
+                name_ye = ["end_" + str(i + 1) + "_" + str(r + 1) for i in range(ky)]
+                name_bigyend[r] = name_ye
+        if name_bigq is None:
+            name_bigq = {}
+        if bigq is not None:  # check on instruments
+            self.bigq = bigq
+            for r in range(self.n_eq):
+                ki = bigq[r].shape[1]
+                name_i = ["inst_" + str(i + 1) + "_" + str(r + 1) for i in range(ki)]
+                name_bigq[r] = name_i
+
+        if regimes is not None:
+            self.constant_regi = "many"
+            self.cols2regi = "all"
+            self.regime_err_sep = False
+            self.name_regimes = USER.set_name_ds(name_regimes)
+            self.regimes_set = REGI._get_regimes_set(regimes)
+            self.regimes = regimes
+            cols2regi_dic = {}
+            self.name_bigX, self.name_bigq, self.name_bigyend = {}, {}, {}
+            self.name_x_r = name_bigX
+
+            # spatial lag dependent variable varying across regimes
+            if regime_lag_sep == True:
+                bigyend, name_bigyend = _get_spatial_lag(
+                    self, bigyend, WS, name_bigyend
+                )
+
+            for r in range(self.n_eq):
+                if bigyend is not None:
+                    self.name_x_r[r] += name_bigyend[r]
+                    cols2regi_dic[r] = REGI.check_cols2regi(
+                        self.constant_regi,
+                        self.cols2regi,
+                        bigX[r],
+                        yend=bigyend[r],
+                        add_cons=False,
+                    )
+                else:
+                    cols2regi_dic[r] = REGI.check_cols2regi(
+                        self.constant_regi, self.cols2regi, bigX[r], add_cons=False
+                    )
+                USER.check_regimes(self.regimes_set, bigy[0].shape[0], bigX[r].shape[1])
+                bigX[r], self.name_bigX[r] = REGI.Regimes_Frame.__init__(
+                    self,
+                    bigX[r],
+                    regimes,
+                    constant_regi=None,
+                    cols2regi=cols2regi_dic[r],
+                    names=name_bigX[r],
+                )
+                if bigq is not None:
+                    bigq[r], self.name_bigq[r] = REGI.Regimes_Frame.__init__(
+                        self,
+                        bigq[r],
+                        regimes,
+                        constant_regi=None,
+                        cols2regi="all",
+                        names=name_bigq[r],
+                    )
+                if bigyend is not None:
+                    bigyend[r], self.name_bigyend[r] = REGI.Regimes_Frame.__init__(
+                        self,
+                        bigyend[r],
+                        regimes,
+                        constant_regi=None,
+                        cols2regi=cols2regi_dic[r],
+                        yend=True,
+                        names=name_bigyend[r],
+                    )
+        else:
+            self.name_bigX, self.name_bigq, self.name_bigyend = (
+                name_bigX,
+                name_bigq,
+                name_bigyend,
+            )
+
+        # spatial lag dependent variable fixed across regimes or no regimes
+        if regimes is None or regime_lag_sep == False:
+            bigyend, self.name_bigyend = _get_spatial_lag(
+                self, bigyend, WS, name_bigyend
+            )
+        # spatially lagged exogenous variables
+        bigwx = {}
+        wxnames = {}
+        if w_lags == 1:
+            for r in range(self.n_eq):
+                bigwx[r] = WS * bigX[r][:, 1:]
+                wxnames[r] = ["W_" + i for i in self.name_bigX[r][1:]]
+            if bigq:  # other instruments
+                if lag_q:  # also lags for instruments
+                    bigwq = {}
+                    for r in range(self.n_eq):
+                        bigwq = WS * bigq[r]
+                        bigq[r] = np.hstack((bigq[r], bigwx[r], bigwq))
+                        wqnames = ["W_" + i for i in self.name_bigq[r]]
+                        wxnames[r] = wxnames[r] + wqnames
+                        self.name_bigq[r] = self.name_bigq[r] + wxnames[r]
+                else:  # no lags for other instruments
+                    for r in range(self.n_eq):
+                        bigq[r] = np.hstack((bigq[r], bigwx[r]))
+                        self.name_bigq[r] = self.name_bigq[r] + wxnames[r]
+            else:  # no other instruments only wx
+                bigq = {}
+                for r in range(self.n_eq):
+                    bigq[r] = bigwx[r]
+                    self.name_bigq[r] = wxnames[r]
+        elif w_lags > 1:  # higher order lags for WX
+            for r in range(self.n_eq):
+                bigwxwork = WS * bigX[r][:, 1:]
+                bigwx[r] = bigwxwork
+                nameswork = ["W_" + i for i in self.name_bigX[r][1:]]
+                wxnames[r] = nameswork
+                for i in range(1, w_lags):
+                    bigwxwork = WS * bigwxwork
+                    bigwx[r] = np.hstack((bigwx[r], bigwxwork))
+                    nameswork = ["W" + i for i in nameswork]
+                    wxnames[r] = wxnames[r] + nameswork
+            if bigq:  # other instruments
+                if lag_q:  # lags for other instruments
+                    wq = {}
+                    wqnames = {}
+                    for r in range(self.n_eq):
+                        bigwq = WS * bigq[r]
+                        wqnameswork = ["W_" + i for i in self.name_bigq[r]]
+                        wqnames[r] = wqnameswork
+                        wq[r] = bigwq
+                        for i in range(1, w_lags):
+                            bigwq = WS * bigwq
+                            wq[r] = np.hstack((wq[r], bigwq))
+                            wqnameswork = ["W" + i for i in wqnameswork]
+                            wqnames[r] = wqnames[r] + wqnameswork
+                        bigq[r] = np.hstack((bigq[r], bigwx[r], wq[r]))
+                        self.name_bigq[r] = self.name_bigq[r] + wxnames[r] + wqnames[r]
+
+                else:  # no lags for other instruments
+                    for r in range(self.n_eq):
+                        bigq[r] = np.hstack((bigq[r], bigwx[r]))
+                        self.name_bigq[r] = self.name_bigq[r] + wxnames[r]
+            else:  # no other instruments only wx
+                bigq = {}
+                for r in range(self.n_eq):
+                    bigq[r] = bigwx[r]
+                    self.name_bigq[r] = wxnames[r]
+
+        else:
+            raise Exception("Lag order must be 1 or higher")
+
+        BaseThreeSLS.__init__(
+            self, bigy=self.bigy, bigX=bigX, bigyend=bigyend, bigq=bigq
+        )
+
+        # inference
+        self.tsls_inf = sur_setp(self.b3SLS, self.varb)
+
+        # test on joint significance of spatial coefficients
+        if spat_diag:
+            self.joinrho = sur_joinrho(self.n_eq, self.bigK, self.b3SLS, self.varb)
+        else:
+            self.joinrho = None
+
+        # test on constancy of coefficients across equations
+        if check_k(self.bigK):  # only for equal number of variables
+            self.surchow = sur_chow(self.n_eq, self.bigK, self.b3SLS, self.varb)
+        else:
+            self.surchow = None
+
+        # list results
+        self.title = "SEEMINGLY UNRELATED REGRESSIONS (SUR) - SPATIAL LAG MODEL"
+        if regimes is not None:
+            self.title = "SUR - SPATIAL LAG MODEL - REGIMES"
+            self.chow_regimes = {}
+            varb_counter = 0
+            fixed_lag = 1
+            if regime_lag_sep == True:
+                fixed_lag += -1
+            for r in range(self.n_eq):
+                counter_end = varb_counter + self.b3SLS[r].shape[0]
+                self.chow_regimes[r] = REGI._chow_run(
+                    len(cols2regi_dic[r]),
+                    fixed_lag,
+                    0,
+                    len(self.regimes_set),
+                    self.b3SLS[r],
+                    self.varb[varb_counter:counter_end, varb_counter:counter_end],
+                )
+                varb_counter = counter_end
+            regimes = True
+        SUMMARY.SUR(
+            reg=self,
+            tsls=True,
+            spat_diag=spat_diag,
+            nonspat_diag=nonspat_diag,
+            ml=False,
+            regimes=regimes,
+        )
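+
+# A minimal sketch (illustrative only) of how the WX instruments are built above
+# for w_lags > 1: higher-order lags are obtained recursively and stacked column
+# by column, so for a sparse weights matrix WS and an exogenous block X
+# (constant excluded),
+#
+#     wx = WS * X                # first-order lag W X
+#     inst = wx
+#     for _ in range(1, w_lags):
+#         wx = WS * wx           # next order: W^2 X, W^3 X, ...
+#         inst = np.hstack((inst, wx))
+#
+# and, when lag_q=True, the same recursion is applied to the other instruments q.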
+import numpy as np
+import numpy.linalg as la
+from . import robust as ROBUST
+from . import user_output as USER
+from . import diagnostics as DIAG
+from .output import output, _spat_diag_out, _summary_dwh
+from .utils import spdot, sphstack, RegressionPropsY, RegressionPropsVM, set_warn, get_lags
+import pandas as pd
+
+__author__ = "Luc Anselin lanselin@gmail.com, Pedro Amaral pedrovma@gmail.com, David C. Folch david.folch@asu.edu, Jing Yao jingyao@asu.edu"
+__all__ = ["TSLS"]
+
+
+class BaseTSLS(RegressionPropsY, RegressionPropsVM):
+
+"""
+ Two stage least squares (2SLS) (note: no consistency checks,
+ diagnostics or constant added)
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x); cannot be
+ used in combination with h
+ h : array
+ Two dimensional array with n rows and one column for each
+ exogenous variable to use as instruments (note: this
+ can contain variables from x); cannot be used in
+ combination with q
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ kstar : integer
+ Number of endogenous variables.
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ hth : float
+ :math:`H'H`
+ hthi : float
+ :math:`(H'H)^{-1}`
+ varb : array
+ :math:`(Z'H (H'H)^{-1} H'Z)^{-1}`
+ zthhthi : array
+ :math:`Z'H(H'H)^{-1}`
+ pfora1a2 : array
+ :math:`n(zthhthi)'varb`
+
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> yd = []
+ >>> yd.append(db.by_col("HOVAL"))
+ >>> yd = np.array(yd).T
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+ >>> reg = spreg.twosls.BaseTSLS(y, X, yd, q=q)
+ >>> print(reg.betas.T)
+ [[88.46579584 0.5200379 -1.58216593]]
+ >>> reg = spreg.twosls.BaseTSLS(y, X, yd, q=q, robust="white")
+
+ """
+
+    def __init__(
+        self, y, x, yend, q=None, h=None, robust=None, gwk=None, sig2n_k=False
+    ):
+
+        if issubclass(type(q), np.ndarray) and issubclass(type(h), np.ndarray):
+            raise Exception("Please do not provide 'q' and 'h' together")
+        if q is None and h is None:
+            raise Exception("Please provide either 'q' or 'h'")
+
+        self.y = y
+        self.n = y.shape[0]
+        self.x = x
+
+        self.kstar = yend.shape[1]
+        # including exogenous and endogenous variables
+        z = sphstack(self.x, yend)
+        if type(h).__name__ not in ["ndarray", "csr_matrix"]:
+            # including exogenous variables and instrument
+            h = sphstack(self.x, q)
+        self.z = z
+        self.h = h
+        self.q = q
+        self.yend = yend
+        # k = number of exogenous variables and endogenous variables
+        self.k = z.shape[1]
+        hth = spdot(h.T, h)
+
+        try:
+            hthi = la.inv(hth)
+        except:
+            raise Exception("H'H singular - no inverse")
+
+        zth = spdot(z.T, h)
+        hty = spdot(h.T, y)
+        factor_1 = np.dot(zth, hthi)
+        factor_2 = np.dot(factor_1, zth.T)
+        # this one needs to be in cache to be used in AK
+
+        try:
+            varb = la.inv(factor_2)
+        except:
+            raise Exception("Singular matrix Z'H(H'H)^-1H'Z - endogenous variable(s) may be part of X")
+
+        factor_3 = np.dot(varb, factor_1)
+        betas = np.dot(factor_3, hty)
+        self.betas = betas
+        self.varb = varb
+        self.zthhthi = factor_1
+
+        # predicted values
+        self.predy = spdot(z, betas)
+
+        # residuals
+        u = y - self.predy
+        self.u = u
+
+        # attributes used in property
+        self.hth = hth  # Required for condition index
+        self.hthi = hthi  # Used in error models
+        self.htz = zth.T
+
+        if robust:
+            self.vm = ROBUST.robust_vm(reg=self, gwk=gwk, sig2n_k=sig2n_k)
+
+        if sig2n_k:
+            self.sig2 = self.sig2n_k
+        else:
+            self.sig2 = self.sig2n
+
+    @property
+    def pfora1a2(self):
+        if "pfora1a2" not in self._cache:
+            self._cache["pfora1a2"] = self.n * np.dot(self.zthhthi.T, self.varb)
+        return self._cache["pfora1a2"]
+
+    @property
+    def vm(self):
+        try:
+            return self._cache["vm"]
+        except AttributeError:
+            self._cache = {}
+            self._cache["vm"] = np.dot(self.sig2, self.varb)
+        except KeyError:
+            self._cache["vm"] = np.dot(self.sig2, self.varb)
+        return self._cache["vm"]
+
+    @vm.setter
+    def vm(self, val):
+        try:
+            self._cache["vm"] = val
+        except AttributeError:
+            self._cache = {}
+            self._cache["vm"] = val
+        except KeyError:
+            self._cache["vm"] = val
+
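+
+# A minimal numerical sketch (illustrative only) of the 2SLS estimator computed
+# in BaseTSLS above, for dense numpy arrays y, x, yend, q of conforming shape:
+#
+#     z = np.hstack((x, yend))              # exogenous + endogenous regressors
+#     h = np.hstack((x, q))                 # instruments
+#     hthi = la.inv(h.T @ h)
+#     zth = z.T @ h
+#     varb = la.inv(zth @ hthi @ zth.T)     # (Z'H (H'H)^-1 H'Z)^-1
+#     betas = varb @ zth @ hthi @ (h.T @ y)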
+
+
+[docs]
+class TSLS(BaseTSLS):
+"""
+ Two stage least squares with results and diagnostics.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ w : pysal W object
+ Spatial weights object (required if running spatial
+ diagnostics)
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+ spat_diag : boolean
+ If True, then compute Anselin-Kelejian test (requires w)
+ nonspat_diag : boolean
+ If True, then compute non-spatial diagnostics
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ kstar : integer
+ Number of endogenous variables.
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ robust : string
+ Adjustment for robust standard errors
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ ak_test : tuple
+ Anselin-Kelejian test; tuple contains the pair (statistic,
+ p-value)
+ dwh : tuple
+ Durbin-Wu-Hausman test; tuple contains the pair (statistic,
+ p-value). Only returned if dwh=True.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ hth : float
+ :math:`H'H`
+ hthi : float
+ :math:`(H'H)^{-1}`
+ varb : array
+ :math:`(Z'H (H'H)^{-1} H'Z)^{-1}`
+ zthhthi : array
+ :math:`Z'H(H'H)^{-1}`
+ pfora1a2 : array
+ :math:`n(zthhthi)'varb`
+
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+
+ Extract the CRIME column (crime rates) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("CRIME"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract the INC (income) vector from the DBF to be used as
+ an independent variable in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in, but this can be overridden by passing
+ constant=False.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X = np.array(X).T
+
+ In this case we consider HOVAL (home value) to be an endogenous regressor.
+ We tell the model that this is so by passing it in a different parameter
+ from the exogenous variables (x).
+
+ >>> yd = []
+ >>> yd.append(db.by_col("HOVAL"))
+ >>> yd = np.array(yd).T
+
+ Because we have endogenous variables, to obtain a correct estimate of the
+ model, we need to instrument for HOVAL. We use DISCBD (distance to the
+ CBD) for this and hence put it in the instruments parameter, 'q'.
+
+ >>> q = []
+ >>> q.append(db.by_col("DISCBD"))
+ >>> q = np.array(q).T
+
+ We are all set with the preliminaries; we are ready to run the model. In this
+ case, we will need the variables (exogenous and endogenous) and the
+ instruments. If we want to have the names of the variables printed in the
+ output summary, we will have to pass them in as well, although this is optional.
+
+ >>> from spreg import TSLS
+ >>> reg = TSLS(y, X, yd, q, name_x=['inc'], name_y='crime', name_yend=['hoval'], name_q=['discbd'], name_ds='columbus')
+ >>> print(reg.betas.T)
+ [[88.46579584 0.5200379 -1.58216593]]
+ """
+
+
+import numpy as np
+import multiprocessing as mp
+import pandas as pd
+from . import regimes as REGI
+from . import user_output as USER
+from .utils import set_warn, RegressionProps_basic, spdot, sphstack, get_lags
+from .twosls import BaseTSLS
+from .robust import hac_multi
+from .output import output, _spat_diag_out
+
+"""
+Two-stage Least Squares estimation with regimes.
+"""
+
+__author__ = "Luc Anselin, Pedro V. Amaral, David C. Folch"
+
+
+
+[docs]
+class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame):
+
+"""
+ Two stage least squares (2SLS) with regimes.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x)
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given.
+ If 'hac', then a HAC consistent estimator of the
+ variance-covariance matrix is given.
+ If 'ogmm', then Optimal GMM is used to estimate
+ betas and the variance-covariance matrix.
+ Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the SLX type.
+ Note: WX is computed using the complete weights matrix
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ vm : array
+ Variance covariance matrix (kxk)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: [False, 'one', 'many']
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal.examples import load_example
+ >>> from libpysal.weights import Rook
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> nat = load_example('Natregimes')
+ >>> db = libpysal.io.open(nat.get_path('natregimes.dbf'), 'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...]
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ In this case we consider RD90 (resource deprivation) as an endogenous regressor.
+ We tell the model that this is so by passing it in a different parameter
+ from the exogenous variables (x).
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+
+ Because we have endogenous variables, to obtain a correct estimate of the
+ model, we need to instrument for RD90. We use FP89 (families below poverty)
+ for this and hence put it in the instruments parameter, 'q'.
+
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to perform tests for spatial dependence, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations into the error component of the model. To do that, we can open
+ an already existing gal file or create a new one. In this case, we will
+ create one from ``NAT.shp``.
+
+ >>> w = Rook.from_shapefile(nat.get_path("natregimes.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ We can now run the regression and then have a summary of the output
+ by typing: print(tslsr.summary)
+ Alternatively, we can just check the betas and standard errors of the
+ parameters:
+
+ >>> from spreg import TSLS_Regimes
+ >>> tslsr = TSLS_Regimes(y, x, yd, q, regimes, w=w, constant_regi='many', spat_diag=False, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp')
+
+ >>> tslsr.betas
+ array([[ 3.66973562],
+ [ 1.06950466],
+ [ 0.14680946],
+ [ 2.45864196],
+ [ 9.55873243],
+ [ 1.94666348],
+ [-0.30810214],
+ [ 3.68718119]])
+
+ >>> np.sqrt(tslsr.vm.diagonal())
+ array([0.38389901, 0.09963973, 0.04672091, 0.22725012, 0.49181223,
+ 0.19630774, 0.07784587, 0.25529011])
+
+ >>> print(tslsr.summary)
+ REGRESSION RESULTS
+ ------------------
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: TWO STAGE LEAST SQUARES ESTIMATION - REGIME 0
+ ----------------------------------------------------------------
+ Data set : NAT
+ Weights matrix : NAT.shp
+ Dependent Variable : 0_HR90 Number of Observations: 1673
+ Mean dependent var : 3.3416 Number of Variables : 4
+ S.D. dependent var : 4.6795 Degrees of Freedom : 1669
+ Pseudo R-squared : 0.2092
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ 0_CONSTANT 3.6697356 0.3838990 9.5591172 0.0000000
+ 0_PS90 1.0695047 0.0996397 10.7337170 0.0000000
+ 0_UE90 0.1468095 0.0467209 3.1422643 0.0016765
+ 0_RD90 2.4586420 0.2272501 10.8191009 0.0000000
+ ------------------------------------------------------------------------------------
+ Instrumented: 0_RD90
+ Instruments: 0_FP89
+ Regimes variable: SOUTH
+ <BLANKLINE>
+ SUMMARY OF OUTPUT: TWO STAGE LEAST SQUARES ESTIMATION - REGIME 1
+ ----------------------------------------------------------------
+ Data set : NAT
+ Weights matrix : NAT.shp
+ Dependent Variable : 1_HR90 Number of Observations: 1412
+ Mean dependent var : 9.5493 Number of Variables : 4
+ S.D. dependent var : 7.0389 Degrees of Freedom : 1408
+ Pseudo R-squared : 0.2987
+ <BLANKLINE>
+ ------------------------------------------------------------------------------------
+ Variable Coefficient Std.Error z-Statistic Probability
+ ------------------------------------------------------------------------------------
+ 1_CONSTANT 9.5587324 0.4918122 19.4357356 0.0000000
+ 1_PS90 1.9466635 0.1963077 9.9163867 0.0000000
+ 1_UE90 -0.3081021 0.0778459 -3.9578483 0.0000756
+ 1_RD90 3.6871812 0.2552901 14.4431026 0.0000000
+ ------------------------------------------------------------------------------------
+ Instrumented: 1_RD90
+ Instruments: 1_FP89
+ Regimes variable: SOUTH
+ ------------------------------------------------------------------------------------
+ GLOBAL DIAGNOSTICS
+ <BLANKLINE>
+ REGIMES DIAGNOSTICS - CHOW TEST
+ VARIABLE DF VALUE PROB
+ CONSTANT 1 89.093 0.0000
+ PS90 1 15.876 0.0001
+ UE90 1 25.106 0.0000
+ RD90 1 12.920 0.0003
+ Global test 4 201.237 0.0000
+ ================================ END OF REPORT =====================================
+ """
+
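+
+# A conceptual sketch (illustrative only) of what the regimes machinery does:
+# the "regimized" columns are interacted with regime indicators, so each regime
+# gets its own coefficients. For two regimes with indicator vectors d0 and d1,
+#
+#     X_expanded = np.hstack((d0 * X, d1 * X))     # cols2regi = 'all'
+#
+# and constant_regi='many' adds one constant column per regime (d0 and d1
+# themselves), while 'one' keeps a single overall constant.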
+
+"""
+Spatial Two Stages Least Squares
+"""
+
+__author__ = "Luc Anselin lanselin@gmail.com, David C. Folch david.folch@asu.edu"
+
+import numpy as np
+from . import twosls as TSLS
+from . import user_output as USER
+from .utils import set_endog, sp_att, set_warn
+import pandas as pd
+from .output import output, _spat_diag_out, _spat_pseudo_r2, _summary_impacts
+from itertools import compress
+
+__all__ = ["GM_Lag"]
+
+
+class BaseGM_Lag(TSLS.BaseTSLS):
+"""
+ Spatial two stage least squares (S2SLS) (note: no consistency checks,
+ diagnostics or constant added); Anselin (1988) [Anselin1988]_
+
+ Parameters
+ ----------
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable; assumes the constant is
+ in column 0.
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x); cannot be
+ used in combination with h
+ w : Pysal weights matrix
+ Spatial weights matrix
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the Spatial Durbin type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+
+
+ Attributes
+ ----------
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ predy : array
+ nx1 array of predicted y values
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ kstar : integer
+ Number of endogenous variables.
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ hth : float
+ H'H
+ hthi : float
+ (H'H)^-1
+ varb : array
+ (Z'H (H'H)^-1 H'Z)^-1
+ zthhthi : array
+ Z'H(H'H)^-1
+ pfora1a2 : array
+ n(zthhthi)'varb
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+ >>> w.transform = 'r'
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+ >>> # no non-spatial endogenous variables
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> reg = spreg.twosls_sp.BaseGM_Lag(y, X, w=w, w_lags=2)
+ >>> reg.betas
+ array([[45.30170561],
+ [ 0.62088862],
+ [-0.48072345],
+ [ 0.02836221]])
+ >>> spreg.se_betas(reg)
+ array([17.91278862, 0.52486082, 0.1822815 , 0.31740089])
+ >>> reg = spreg.twosls_sp.BaseGM_Lag(y, X, w=w, w_lags=2, robust='white')
+ >>> reg.betas
+ array([[45.30170561],
+ [ 0.62088862],
+ [-0.48072345],
+ [ 0.02836221]])
+ >>> spreg.se_betas(reg)
+ array([20.47077481, 0.50613931, 0.20138425, 0.38028295])
+ >>> # instrument for HOVAL with DISCBD
+ >>> X = np.array(db.by_col("INC"))
+ >>> X = np.reshape(X, (49,1))
+ >>> yd = np.array(db.by_col("CRIME"))
+ >>> yd = np.reshape(yd, (49,1))
+ >>> q = np.array(db.by_col("DISCBD"))
+ >>> q = np.reshape(q, (49,1))
+ >>> X = np.hstack((np.ones(y.shape),X))
+ >>> reg = spreg.twosls_sp.BaseGM_Lag(y, X, w=w, yend=yd, q=q, w_lags=2)
+ >>> reg.betas
+ array([[100.79359082],
+ [ -0.50215501],
+ [ -1.14881711],
+ [ -0.38235022]])
+ >>> spreg.se_betas(reg)
+ array([53.0829123 , 1.02511494, 0.57589064, 0.59891744])
+
+ """
+
+    def __init__(
+        self,
+        y,
+        x,
+        yend=None,
+        q=None,
+        w=None,
+        w_lags=1,
+        slx_lags=0,
+        slx_vars="All",
+        lag_q=True,
+        robust=None,
+        gwk=None,
+        sig2n_k=False,
+    ):
+
+        if slx_lags > 0:
+            yend2, q2, wx = set_endog(y, x[:, 1:], w, yend, q, w_lags, lag_q, slx_lags, slx_vars)
+            x = np.hstack((x, wx))
+        else:
+            yend2, q2 = set_endog(y, x[:, 1:], w, yend, q, w_lags, lag_q)
+
+        TSLS.BaseTSLS.__init__(
+            self, y=y, x=x, yend=yend2, q=q2, robust=robust, gwk=gwk, sig2n_k=sig2n_k
+        )
+
+
+
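+
+# A conceptual sketch (illustrative only) of what BaseGM_Lag sets up: the
+# spatially lagged dependent variable W y is appended to the endogenous
+# variables and the spatial lags of X serve as additional instruments, so
+# that, with w_lags=2 for example,
+#
+#     yend2 = [yend, W y]
+#     q2    = [q, W X, W W X]       # plus lags of q when lag_q=True
+#
+# before the standard 2SLS computation in TSLS.BaseTSLS is applied.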
+[docs]
+class GM_Lag(BaseGM_Lag):
+"""
+ Spatial two stage least squares (S2SLS) with results and diagnostics;
+ Anselin (1988) :cite:`Anselin1988`
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x); cannot be
+ used in combination with h
+ w : pysal W object
+ Spatial weights object
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the Spatial Durbin type.
+ slx_vars : either "All" (default) or list of booleans to select x variables
+ to be lagged
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given. If 'hac', then a
+ HAC consistent estimator of the variance-covariance
+ matrix is given. Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+ spat_diag : boolean
+ If True, then compute Anselin-Kelejian test and Common Factor Hypothesis test (if applicable)
+ spat_impacts : string or list
+ Include average direct impact (ADI), average indirect impact (AII),
+ and average total impact (ATI) in summary results.
+ Options are 'simple', 'full', 'power', 'all' or None.
+ See sputils.spmultiplier for more information.
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ hard_bound : boolean
+ If true, raises an exception if the estimated spatial
+ autoregressive parameter is outside the bounds of -1 and 1.
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ kstar : integer
+ Number of endogenous variables.
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ z : array
+ nxk array of variables (combination of x and yend)
+ h : array
+ nxl array of instruments (combination of x and q)
+ robust : string
+ Adjustment for robust standard errors
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ std_err : array
+ 1xk array of standard errors of the betas
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ ak_test : tuple
+ Anselin-Kelejian test; tuple contains the pair (statistic,
+ p-value)
+ cfh_test : tuple
+ Common Factor Hypothesis test; tuple contains the pair (statistic,
+ p-value). Only when it applies (see specific documentation).
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ title : string
+ Name of the regression method used
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ hth : float
+ :math:`H'H`
+ hthi : float
+ :math:`(H'H)^{-1}`
+ varb : array
+ :math:`(Z'H (H'H)^{-1} H'Z)^{-1}`
+ zthhthi : array
+ :math:`Z'H(H'H)^{-1}`
+ pfora1a2 : array
+ n(zthhthi)'varb
+ sp_multipliers: dict
+ Dictionary of spatial multipliers (if spat_impacts is not None)
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis. Since we will need some tests for our
+ model, we also import the diagnostics module.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> import spreg
+
+ Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
+ This is the DBF associated with the Columbus shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"),'r')
+
+ Extract the HOVAL column (home value) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y = np.array(db.by_col("HOVAL"))
+ >>> y = np.reshape(y, (49,1))
+
+ Extract INC (income) and CRIME (crime rates) vectors from the DBF to be used as
+ independent variables in the regression. Note that PySAL requires this to
+ be an nxj numpy array, where j is the number of independent variables (not
+ including a constant). By default this model adds a vector of ones to the
+ independent variables passed in, but this can be overridden by passing
+ constant=False.
+
+ >>> X = []
+ >>> X.append(db.by_col("INC"))
+ >>> X.append(db.by_col("CRIME"))
+ >>> X = np.array(X).T
+
+ Since we want to run a spatial lag model, we need to specify the spatial
+ weights matrix that includes the spatial configuration of the observations.
+ To do that, we can open an already
+ existing gal file or create a new one. In this case, we will create one
+ from ``columbus.shp``.
+
+ >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ This class runs a lag model, which means that it includes the spatial lag of
+ the dependent variable on the right-hand side of the equation. If we want
+ to have the names of the variables printed in the
+ output summary, we will have to pass them in as well, although this is
+ optional. The default most basic model to be run would be:
+
+ >>> from spreg import GM_Lag
+ >>> np.set_printoptions(suppress=True) #prevent scientific format
+ >>> reg=GM_Lag(y, X, w=w, w_lags=2, name_x=['inc', 'crime'], name_y='hoval', name_ds='columbus')
+ >>> reg.betas
+ array([[45.30170561],
+ [ 0.62088862],
+ [-0.48072345],
+ [ 0.02836221]])
+
+ Once the model is run, we can obtain the standard error of the coefficient
+ estimates by calling the diagnostics module:
+
+ >>> spreg.se_betas(reg)
+ array([17.91278862, 0.52486082, 0.1822815 , 0.31740089])
+
+ But we can also run models that incorporate corrected standard errors
+ following the White procedure. For that, we will have to include the
+ optional parameter ``robust='white'``:
+
+ >>> reg=GM_Lag(y, X, w=w, w_lags=2, robust='white', name_x=['inc', 'crime'], name_y='hoval', name_ds='columbus')
+ >>> reg.betas
+ array([[45.30170561],
+ [ 0.62088862],
+ [-0.48072345],
+ [ 0.02836221]])
+
+ And we can access the standard errors from the model object:
+
+ >>> reg.std_err
+ array([20.47077481, 0.50613931, 0.20138425, 0.38028295])
+
+ The class is flexible enough to accommodate a spatial lag model that,
+ besides the spatial lag of the dependent variable, includes other
+ non-spatial endogenous regressors. As an example, we will assume that
+ CRIME is actually endogenous and we decide to instrument for it with
+ DISCBD (distance to the CBD). We reload the X including INC only and
+ define CRIME as endogenous and DISCBD as instrument:
+
+ >>> X = np.array(db.by_col("INC"))
+ >>> X = np.reshape(X, (49,1))
+ >>> yd = np.array(db.by_col("CRIME"))
+ >>> yd = np.reshape(yd, (49,1))
+ >>> q = np.array(db.by_col("DISCBD"))
+ >>> q = np.reshape(q, (49,1))
+
+ And we can run the model again:
+
+ >>> reg=GM_Lag(y, X, w=w, yend=yd, q=q, w_lags=2, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')
+ >>> reg.betas
+ array([[100.79359082],
+ [ -0.50215501],
+ [ -1.14881711],
+ [ -0.38235022]])
+
+ Once the model is run, we can obtain the standard error of the coefficient
+ estimates by calling the diagnostics module:
+
+ >>> spreg.se_betas(reg)
+ array([53.0829123 , 1.02511494, 0.57589064, 0.59891744])
+
+ """
+
+
+[docs]
+    def __init__(
+ self,
+ y,
+ x,
+ yend=None,
+ q=None,
+ w=None,
+ w_lags=1,
+ lag_q=True,
+ slx_lags=0,
+ slx_vars="All",
+ robust=None,
+ gwk=None,
+ sig2n_k=False,
+ spat_diag=True,
+ spat_impacts="simple",
+ vm=False,
+ name_y=None,
+ name_x=None,
+ name_yend=None,
+ name_q=None,
+ name_w=None,
+ name_gwk=None,
+ name_ds=None,
+ latex=False,
+ hard_bound=False,
+ ):
+
+        n = USER.check_arrays(x, yend, q)
+        y, name_y = USER.check_y(y, n, name_y)
+        w = USER.check_weights(w, y, w_required=True, slx_lags=slx_lags)
+        USER.check_robust(robust, gwk)
+        yend, q, name_yend, name_q = USER.check_endog([yend, q], [name_yend, name_q])
+        spat_diag, warn = USER.check_spat_diag(spat_diag=spat_diag, w=w, robust=robust, slx_lags=slx_lags)
+        set_warn(self, warn)
+        x_constant, name_x, warn = USER.check_constant(x, name_x)
+        set_warn(self, warn)
+        name_x = USER.set_name_x(name_x, x_constant)  # need to check for None and set defaults
+
+        # kx and wkx are used to replace complex calculation for output
+        if slx_lags > 0:  # adjust for flexwx
+            if isinstance(slx_vars, list):  # slx_vars has True, False
+                if len(slx_vars) != x.shape[1]:
+                    raise Exception("slx_vars incompatible with x column dimensions")
+                else:  # use slx_vars to extract proper columns
+                    workname = name_x[1:]
+                    kx = len(workname)
+                    vv = list(compress(workname, slx_vars))
+                    name_x += USER.set_name_spatial_lags(vv, slx_lags)
+                    wkx = slx_vars.count(True)
+            else:
+                kx = len(name_x) - 1
+                wkx = kx
+                name_x += USER.set_name_spatial_lags(name_x[1:], slx_lags)  # exclude constant
+
+
+ BaseGM_Lag.__init__(
+ self,
+ y=y,
+ x=x_constant,
+ w=w,
+ yend=yend,
+ q=q,
+ w_lags=w_lags,
+ slx_lags=slx_lags,
+ slx_vars=slx_vars,
+ robust=robust,
+ gwk=gwk,
+ lag_q=lag_q,
+ sig2n_k=sig2n_k,
+ )
+
+        self.rho = self.betas[-1]
+        self.predy_e, self.e_pred, warn = sp_att(
+            w, self.y, self.predy, self.yend[:, -1].reshape(self.n, 1), self.rho, hard_bound=hard_bound
+        )
+        set_warn(self, warn)
+        self.title = "SPATIAL TWO STAGE LEAST SQUARES"
+        if slx_lags > 0:
+            self.title += " WITH SLX (SPATIAL DURBIN MODEL)"
+        self.name_ds = USER.set_name_ds(name_ds)
+        self.name_y = USER.set_name_y(name_y)
+        # self.name_x = USER.set_name_x(name_x, x_constant)  # name_x contains SLX terms for slx_lags > 0
+        self.name_x = name_x  # already contains constant in new setup
+        self.name_yend = USER.set_name_yend(name_yend, yend)
+        self.name_yend.append(USER.set_name_yend_sp(self.name_y))
+        self.name_z = self.name_x + self.name_yend
+        self.name_q = USER.set_name_q(name_q, q)
+
+        if slx_lags > 0:  # need to remove all but last SLX variables from name_x
+            self.name_x0 = []
+            self.name_x0.append(self.name_x[0])  # constant
+            if isinstance(slx_vars, list):  # boolean list passed
+                # x variables that were not lagged
+                self.name_x0.extend(list(compress(self.name_x[1:], [not i for i in slx_vars])))
+                # last wkx variables
+                self.name_x0.extend(self.name_x[-wkx:])
+            else:
+                okx = int((self.k - self.kstar - 1) / (slx_lags + 1))  # number of original exogenous vars
+                self.name_x0.extend(self.name_x[-okx:])
+
+            self.name_q.extend(USER.set_name_q_sp(self.name_x0, w_lags, self.name_q, lag_q))
+
+            # var_types = ['x'] * (kx + 1) + ['wx'] * kx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho']
+            var_types = ['x'] * (kx + 1) + ['wx'] * wkx * slx_lags + ['yend'] * (len(self.name_yend) - 1) + ['rho']
+        else:
+            self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q))
+            var_types = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend) - 1) + ['rho']
+
+        self.name_h = USER.set_name_h(self.name_x, self.name_q)
+        self.robust = USER.set_robust(robust)
+        self.name_w = USER.set_name_w(name_w, w)
+        self.name_gwk = USER.set_name_w(name_gwk, gwk)
+        self.slx_lags = slx_lags
+        self.slx_vars = slx_vars
+
+        self.output = pd.DataFrame(self.name_x + self.name_yend, columns=['var_names'])
+        self.output['var_type'] = var_types
+        self.output['regime'], self.output['equation'] = (0, 0)
+        self.other_top = _spat_pseudo_r2(self)
+        diag_out = None
+
+        if spat_diag:
+            diag_out = _spat_diag_out(self, w, 'yend')
+        if spat_impacts:
+            self.sp_multipliers, impacts_str = _summary_impacts(self, w, spat_impacts, slx_lags, slx_vars)
+            try:
+                diag_out += impacts_str
+            except TypeError:
+                diag_out = impacts_str
+        output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex)
+"""
+Spatial Two Stage Least Squares with Regimes
+"""
+
+__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, David C. Folch david.folch@asu.edu"
+
+import numpy as np
+import pandas as pd
+import multiprocessing as mp
+from . import regimes as REGI
+from . import user_output as USER
+from .twosls_regimes import TSLS_Regimes, _optimal_weight
+from .twosls import BaseTSLS
+from .utils import set_endog, set_endog_sparse, sp_att, set_warn, sphstack, spdot, optim_k
+from .robust import hac_multi
+from .output import output, _spat_diag_out, _spat_pseudo_r2, _summary_impacts
+from .skater_reg import Skater_reg
+from .twosls_sp import BaseGM_Lag
+
+
+[docs]
+class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame):
+
+"""
+ Spatial two stage least squares (S2SLS) with regimes;
+ :cite:`Anselin1988`
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ regimes : list or pandas.Series
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ yend : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ q : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ external exogenous variable to use as instruments (note:
+ this should not contain any variables from x); cannot be
+ used in combination with h
+ constant_regi: string
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime (default).
+ cols2regi : list, 'all'
+ Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all' (default), all the variables vary by regime.
+ w : pysal W object
+ Spatial weights object
+ w_lags : integer
+ Orders of W to include as instruments for the spatially
+ lagged dependent variable. For example, w_lags=1, then
+ instruments are WX; if w_lags=2, then WX, WWX; and so on.
+ lag_q : boolean
+ If True, then include spatial lags of the additional
+ instruments (q).
+ slx_lags : integer
+ Number of spatial lags of X to include in the model specification.
+ If slx_lags>0, the specification becomes of the Spatial Durbin type.
+ regime_lag_sep: boolean
+ If True (default), the spatial parameter for spatial lag is also
+ computed according to different regimes. If False,
+ the spatial parameter is fixed across regimes.
+ Option valid only when regime_err_sep=True
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ robust : string
+ If 'white', then a White consistent estimator of the
+ variance-covariance matrix is given.
+ If 'hac', then a HAC consistent estimator of the
+ variance-covariance matrix is given.
+ If 'ogmm', then Optimal GMM is used to estimate
+ betas and the variance-covariance matrix.
+ Default set to None.
+ gwk : pysal W object
+ Kernel spatial weights needed for HAC estimation. Note:
+ matrix must have ones along the main diagonal.
+ sig2n_k : boolean
+ If True, then use n-k to estimate sigma^2. If False, use n.
+ spat_impacts : string or list
+ Include average direct impact (ADI), average indirect impact (AII),
+ and average total impact (ATI) in summary results.
+ Options are 'simple', 'full', 'power', 'all' or None.
+ See sputils.spmultiplier for more information.
+ spat_diag : boolean
+ If True, then compute Anselin-Kelejian test and Common Factor Hypothesis test (if applicable)
+ vm : boolean
+ If True, include variance-covariance matrix in summary
+ results
+ cores : boolean
+ Specifies if multiprocessing is to be used
+ Default: no multiprocessing, cores = False
+ Note: Multiprocessing may not work on all platforms.
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_q : list of strings
+ Names of instruments for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ kstar : integer
+ Number of endogenous variables.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ h : array
+ nxl array of instruments (combination of x and q)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ robust : string
+ Adjustment for robust standard errors
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ ak_test : tuple
+ Anselin-Kelejian test; tuple contains the pair (statistic,
+ p-value)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ cfh_test : tuple
+ Common Factor Hypothesis test; tuple contains the pair (statistic,
+ p-value). Only when it applies (see specific documentation).
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ hth : float
+ :math:`H'H`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ hthi : float
+ :math:`(H'H)^{-1}`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ varb : array
+ :math:`(Z'H (H'H)^{-1} H'Z)^{-1}`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ zthhthi : array
+ :math:`Z'H(H'H)^{-1}`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pfora1a2 : array
+ n(zthhthi)'varb
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sp_multipliers: dict
+ Dictionary of spatial multipliers (if spat_impacts is not None)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime.
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+
+ Examples
+ --------
+
+ We first need to import the needed modules, namely numpy to convert the
+ data we read into arrays that ``spreg`` understands and ``pysal`` to
+ perform all the analysis.
+
+ >>> import numpy as np
+ >>> import libpysal
+ >>> from libpysal import examples
+
+ Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open().
+ This is the DBF associated with the NAT shapefile. Note that
+ libpysal.io.open() also reads data in CSV format; since the actual class
+ requires data to be passed in as numpy arrays, the user can read their
+ data in using any method.
+
+ >>> db = libpysal.io.open(examples.get_path("NAT.dbf"),'r')
+
+ Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the
+ dependent variable for the regression. Note that PySAL requires this to be
+ a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
+ that other packages accept.
+
+ >>> y_var = 'HR90'
+ >>> y = np.array([db.by_col(y_var)]).reshape(3085,1)
+
+ Extract UE90 (unemployment rate) and PS90 (population structure) vectors from
+ the DBF to be used as independent variables in the regression. Other variables
+ can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2',...].
+ Note that PySAL requires this to be an nxj numpy array, where j is the
+ number of independent variables (not including a constant). By default
+ this model adds a vector of ones to the independent variables passed in.
+
+ >>> x_var = ['PS90','UE90']
+ >>> x = np.array([db.by_col(name) for name in x_var]).T
+
+ The different regimes in this data are given according to the North and
+ South dummy (SOUTH).
+
+ >>> r_var = 'SOUTH'
+ >>> regimes = db.by_col(r_var)
+
+ Since we want to run a spatial lag model, we need to specify
+ the spatial weights matrix that includes the spatial configuration of the
+ observations. To do that, we can open an already existing gal file or
+ create a new one. In this case, we will create one from ``NAT.shp``.
+
+ >>> from libpysal import weights
+ >>> w = weights.Rook.from_shapefile(examples.get_path("NAT.shp"))
+
+ Unless there is a good reason not to do it, the weights have to be
+ row-standardized so every row of the matrix sums to one. Among other
+ things, this allows us to interpret the spatial lag of a variable as the
+ average value of the neighboring observations. In PySAL, this can be
+ easily performed in the following way:
+
+ >>> w.transform = 'r'
+
+ This class runs a lag model, which means that it includes the spatial lag
+ of the dependent variable on the right-hand side of the equation. If we
+ want to have the names of the variables printed in the output summary, we
+ will have to pass them in as well, although this is optional.
+
+ >>> from spreg import GM_Lag_Regimes
+ >>> model=GM_Lag_Regimes(y, x, regimes, w=w, regime_lag_sep=False, regime_err_sep=False, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp')
+ >>> model.betas
+ array([[ 1.28897623],
+ [ 0.79777722],
+ [ 0.56366891],
+ [ 8.73327838],
+ [ 1.30433406],
+ [ 0.62418643],
+ [-0.39993716]])
+
+ Once the model is run, we can have a summary of the output by typing
+ ``model.summary``. Alternatively, we can obtain the standard errors of
+ the coefficient estimates by calling:
+
+ >>> model.std_err
+ array([0.38492902, 0.19106926, 0.06063249, 1.25607153, 0.36117334,
+ 0.092293 , 0.15116983])
+
+ In the example above, all coefficients but the spatial lag vary
+ according to the regime. It is also possible to have the spatial lag
+ varying according to the regime, which effectively results in an
+ independent spatial lag model estimated for each regime. To run these
+ models, the argument regime_lag_sep must be set to True:
+
+ >>> model=GM_Lag_Regimes(y, x, regimes, w=w, regime_lag_sep=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp')
+ >>> print(model.output)
+ var_names coefficients std_err zt_stat prob
+ 0 0_CONSTANT 1.365848 0.385177 3.546023 0.000391
+ 1 0_PS90 0.808757 0.206672 3.91325 0.000091
+ 2 0_UE90 0.569468 0.067703 8.411247 0.0
+ 3 0_W_HR90 -0.434244 0.177208 -2.450478 0.014267
+ 4 1_CONSTANT 7.907311 1.772336 4.461518 0.000008
+ 5 1_PS90 1.274657 0.368306 3.460869 0.000538
+ 6 1_UE90 0.601677 0.102102 5.892907 0.0
+ 7 1_W_HR90 -0.296034 0.226243 -1.308474 0.190712
+
+ Alternatively, we can type ``model.summary`` to see the organized results output.
+ The class is flexible enough to accommodate a spatial lag model that,
+ besides the spatial lag of the dependent variable, includes other
+ non-spatial endogenous regressors. As an example, we will add the endogenous
+ variable RD90 (resource deprivation) and instrument for it with
+ FP89 (families below poverty):
+
+ >>> yd_var = ['RD90']
+ >>> yd = np.array([db.by_col(name) for name in yd_var]).T
+ >>> q_var = ['FP89']
+ >>> q = np.array([db.by_col(name) for name in q_var]).T
+
+ And we can run the model again:
+
+ >>> model = GM_Lag_Regimes(y, x, regimes, yend=yd, q=q, w=w, regime_lag_sep=False, regime_err_sep=False, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp')
+ >>> model.betas
+ array([[ 3.42195202],
+ [ 1.03311878],
+ [ 0.14308741],
+ [ 8.99740066],
+ [ 1.91877758],
+ [-0.32084816],
+ [ 2.38918212],
+ [ 3.67243761],
+ [ 0.06959139]])
+
+ Once the model is run, we can obtain the standard errors of the coefficient
+ estimates as shown below. Alternatively, we can have a summary of the output
+ by typing ``model.summary``.
+
+ >>> model.std_err
+ array([0.49529467, 0.18912143, 0.05157813, 0.92277557, 0.33711135,
+ 0.08993181, 0.33506177, 0.36381449, 0.07209498])
+ """
+
+
+[docs]
+classGM_Lag_Endog_Regimes(GM_Lag_Regimes):
+
+"""
+ Spatial two stage least squares (S2SLS) with endogenous regimes.
+ Based on the function skater_reg as shown in :cite:`Anselin2021`.
+
+ Parameters
+ ----------
+ y : numpy.ndarray or pandas.Series
+ nx1 array for dependent variable
+ x : numpy.ndarray or pandas object
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, excluding the constant
+ w : pysal W object
+ Spatial weights object (required if running spatial
+ diagnostics)
+ n_clusters : int
+ Number of clusters to be used in the endogenous regimes.
+ If None (default), the number of clusters will be chosen
+ according to the function utils.optim_k using a method
+ adapted from Mojena (1977)'s Rule Two
+ quorum : int
+ Minimum number of observations in a cluster to be considered.
+ Must be larger than the number of variables in x.
+ Default value is 30 or 10*k, whichever is larger.
+ trace : boolean
+ Sets whether to store intermediate results of the clustering
+ Hard-coded to True if n_clusters is None
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ latex : boolean
+ Specifies if summary is to be printed in latex format
+ **kwargs : additional keyword arguments depending on the specific model
+
+ Attributes
+ ----------
+ output : dataframe
+ regression results pandas dataframe
+ summary : string
+ Summary of regression results and diagnostics (note: use in
+ conjunction with the print command)
+ betas : array
+ kx1 array of estimated coefficients
+ u : array
+ nx1 array of residuals
+ e_pred : array
+ nx1 array of residuals (using reduced form)
+ predy : array
+ nx1 array of predicted y values
+ predy_e : array
+ nx1 array of predicted y values (using reduced form)
+ n : integer
+ Number of observations
+ k : integer
+ Number of variables for which coefficients are estimated
+ (including the constant)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ kstar : integer
+ Number of endogenous variables.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ y : array
+ nx1 array for dependent variable
+ x : array
+ Two dimensional array with n rows and one column for each
+ independent (exogenous) variable, including the constant
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ yend : array
+ Two dimensional array with n rows and one column for each
+ endogenous variable
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ q : array
+ Two dimensional array with n rows and one column for each
+ external exogenous variable used as instruments
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z : array
+ nxk array of variables (combination of x and yend)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ h : array
+ nxl array of instruments (combination of x and q)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ robust : string
+ Adjustment for robust standard errors
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ mean_y : float
+ Mean of dependent variable
+ std_y : float
+ Standard deviation of dependent variable
+ vm : array
+ Variance covariance matrix (kxk)
+ pr2 : float
+ Pseudo R squared (squared correlation between y and ypred)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pr2_e : float
+ Pseudo R squared (squared correlation between y and ypred_e
+ (using reduced form))
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ utu : float
+ Sum of squared residuals
+ sig2 : float
+ Sigma squared used in computations
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ std_err : array
+ 1xk array of standard errors of the betas
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ z_stat : list of tuples
+ z statistic; each tuple contains the pair (statistic,
+ p-value), where each is a float
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ ak_test : tuple
+ Anselin-Kelejian test; tuple contains the pair (statistic,
+ p-value)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ cfh_test : tuple
+ Common Factor Hypothesis test; tuple contains the pair (statistic,
+ p-value). Only when it applies (see specific documentation).
+ name_y : string
+ Name of dependent variable for use in output
+ name_x : list of strings
+ Names of independent variables for use in output
+ name_yend : list of strings
+ Names of endogenous variables for use in output
+ name_z : list of strings
+ Names of exogenous and endogenous variables for use in
+ output
+ name_q : list of strings
+ Names of external instruments
+ name_h : list of strings
+ Names of all instruments used in output
+ name_w : string
+ Name of weights matrix for use in output
+ name_gwk : string
+ Name of kernel weights matrix for use in output
+ name_ds : string
+ Name of dataset for use in output
+ name_regimes : string
+ Name of regimes variable for use in output
+ title : string
+ Name of the regression method used
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sig2n : float
+ Sigma squared (computed with n in the denominator)
+ sig2n_k : float
+ Sigma squared (computed with n-k in the denominator)
+ hth : float
+ :math:`H'H`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ hthi : float
+ :math:`(H'H)^{-1}`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ varb : array
+ :math:`(Z'H (H'H)^{-1} H'Z)^{-1}`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ zthhthi : array
+ :math:`Z'H(H'H)^{-1}`.
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ pfora1a2 : array
+ n(zthhthi)'varb
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ sp_multipliers: dict
+ Dictionary of spatial multipliers (if spat_impacts is not None)
+ Only available in dictionary 'multi' when multiple regressions
+ (see 'multi' below for details)
+ regimes : list
+ List of n values with the mapping of each
+ observation to a regime. Assumed to be aligned with 'x'.
+ constant_regi: string
+ Ignored if regimes=False. Constant option for regimes.
+ Switcher controlling the constant term setup. It may take
+ the following values:
+
+ * 'one': a vector of ones is appended to x and held constant across regimes.
+
+ * 'many': a vector of ones is appended to x and considered different per regime.
+ cols2regi : list, 'all'
+ Ignored if regimes=False. Argument indicating whether each
+ column of x should be considered as different per regime
+ or held constant across regimes (False).
+ If a list, k booleans indicating for each variable the
+ option (True if one per regime, False to be held constant).
+ If 'all', all the variables vary by regime.
+ regime_lag_sep: boolean
+ If True, the spatial parameter for spatial lag is also
+ computed according to different regimes. If False (default),
+ the spatial parameter is fixed across regimes.
+ regime_err_sep: boolean
+ If True, a separate regression is run for each regime.
+ kr : int
+ Number of variables/columns to be "regimized" or subject
+ to change by regime. These will result in one parameter
+ estimate by regime for each variable (i.e. nr parameters per
+ variable)
+ kf : int
+ Number of variables/columns to be considered fixed or
+ global across regimes and hence only obtain one parameter
+ estimate
+ nr : int
+ Number of different regimes in the 'regimes' list
+ multi : dictionary
+ Only available when multiple regressions are estimated,
+ i.e. when regime_err_sep=True and no variable is fixed
+ across regimes.
+ Contains all attributes of each individual regression
+ SSR : list
+ list with the total sum of squared residuals for the model
+ considering all regimes for each of steps of number of regimes
+ considered, starting with the solution with 2 regimes.
+ clusters : int
+ Number of clusters considered in the endogenous regimes
+ _trace : list
+ List of dictionaries with the clustering results for each
+ number of clusters tested. Only available if n_clusters is
+ None or trace=True.
+ Examples
+ --------
+ >>> import libpysal
+ >>> import numpy as np
+ >>> np.set_printoptions(legacy='1.25') #to avoid printing issues with numpy floats
+ >>> import geopandas as gpd
+ >>> from spreg import GM_Lag_Endog_Regimes
+
+ Open data on house sales prices and characteristics in Baltimore
+ from the libpysal examples using geopandas.
+
+ >>> db = gpd.read_file(libpysal.examples.get_path('baltim.shp'))
+
+ We will create a weights matrix based on contiguity.
+
+ >>> w = libpysal.weights.Queen.from_dataframe(db, use_index=True)
+ >>> w.transform = "r"
+
+ For this example, we will use the 'PRICE' column as the dependent variable and
+ the 'NROOM', 'AGE', and 'SQFT' columns as independent variables.
+ At this point, we will let the model choose the number of clusters.
+
+ >>> reg = GM_Lag_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, name_w="baltim_q.gal")
+
+ The function `print(reg.summary)` can be used to visualize the results of the regression.
+
+ Alternatively, we can check individual attributes:
+ >>> reg.betas
+ array([[ 6.20932938],
+ [ 4.25581944],
+ [-0.1468118 ],
+ [ 0.40893082],
+ [ 5.01866492],
+ [ 4.84994184],
+ [-0.55425337],
+ [ 1.04577632],
+ [ 0.05155043]])
+ >>> reg.SSR
+ [59784.06769835169, 56858.621800274515]
+ >>> reg.clusters
+ array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
+ 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], dtype=int32)
+
+ We will now set the number of clusters to 2 and run the regression again.
+
+ >>> reg = GM_Lag_Endog_Regimes(y=db['PRICE'], x=db[['NROOM','AGE','SQFT']], w=w, n_clusters=2, name_w="baltim_q.gal")
+
+ The function `print(reg.summary)` can be used to visualize the results of the regression.
+
+ Alternatively, we can check individual attributes as before:
+ >>> reg.betas
+ array([[ 6.20932938],
+ [ 4.25581944],
+ [-0.1468118 ],
+ [ 0.40893082],
+ [ 5.01866492],
+ [ 4.84994184],
+ [-0.55425337],
+ [ 1.04577632],
+ [ 0.05155043]])
+ >>> reg.SSR
+ [59784.06769835169]
+ >>> reg.clusters
+ array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
+ 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], dtype=int32)
+
+ """
+
+
+
+[docs]
+    def __init__(
+        self, y, x, w, n_clusters=None, quorum=-1, trace=True, name_y=None, name_x=None, **kwargs):
+
+        n = USER.check_arrays(y, x)
+        y, name_y = USER.check_y(y, n, name_y)
+        w = USER.check_weights(w, y, w_required=True)
+        x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True)
+        set_warn(self, warn)
+        # Standardize the variables
+        x_std = (x_constant - np.mean(x_constant, axis=0)) / np.std(x_constant, axis=0)
+
+        if quorum < 0:
+            quorum = np.max([(x.shape[1] + 1) * 10, 30])
+
+        if not n_clusters:
+            n_clusters_opt = x_constant.shape[0] * 0.70 // quorum
+            if n_clusters_opt < 2:
+                raise ValueError(
+                    "The combination of the values of `N` and `quorum` is not compatible with regimes estimation.")
+            sk_reg_results = Skater_reg().fit(n_clusters_opt, w, x_std, {'reg': BaseGM_Lag, 'y': y, 'x': x_constant, 'w': w}, quorum=quorum, trace=True)
+            n_clusters = optim_k([sk_reg_results._trace[i][1][2] for i in range(1, len(sk_reg_results._trace))])
+            self.clusters = sk_reg_results._trace[n_clusters - 1][0]
+        else:
+            try:
+                # Call the Skater_reg method based on GM_Lag
+                sk_reg_results = Skater_reg().fit(n_clusters, w, x_std, {'reg': BaseGM_Lag, 'y': y, 'x': x_constant, 'w': w}, quorum=quorum, trace=trace)
+                self.clusters = sk_reg_results.current_labels_
+            except Exception as e:
+                if str(e) == "one or more input arrays have more columns than rows":
+                    raise ValueError("One or more input ended up with more variables than observations. Please check your setting for `quorum`.")
+                else:
+                    print("An error occurred:", e)
+
+        self._trace = sk_reg_results._trace
+        self.SSR = [self._trace[i][1][2] for i in range(1, len(self._trace))]
+
+        GM_Lag_Regimes.__init__(self, y, x, regimes=self.clusters, w=w, name_y=name_y, name_x=name_x, name_regimes='Skater_reg', **kwargs)
+
+
+
\ No newline at end of file
diff --git a/_sources/api.rst.txt b/_sources/api.rst.txt
new file mode 100644
index 00000000..699392f6
--- /dev/null
+++ b/_sources/api.rst.txt
@@ -0,0 +1,186 @@
+.. _api_ref:
+
+.. currentmodule:: spreg
+
+API reference
+=============
+
+.. _models_api:
+
+Classic Models
+--------------
+
+.. autosummary::
+ :toctree: generated/
+
+ spreg.OLS
+ spreg.TSLS
+
+Spatial Regression Models
+-------------------------
+
+These are the standard spatial regression models supported by the `spreg` package. Each of them contains a significant amount of detail in its docstring discussing how it is used, how it is fit, and how to interpret the results.
+
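+A minimal sketch of the typical workflow, condensed from the ``GM_Lag``
+docstring example above (it assumes only that ``libpysal`` and its example
+datasets are installed; the other models listed below follow the same
+pattern):
+
+.. code-block:: python
+
+    import numpy as np
+    import libpysal
+    import spreg
+
+    # read the Columbus example data shipped with libpysal
+    db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+    y = np.array(db.by_col("HOVAL")).reshape(-1, 1)            # dependent variable, nx1
+    X = np.array([db.by_col("INC"), db.by_col("CRIME")]).T     # exogenous variables, nxk
+
+    # row-standardized contiguity weights built from the shapefile
+    w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp"))
+    w.transform = "r"
+
+    # spatial two stage least squares (lag model)
+    reg = spreg.GM_Lag(y, X, w=w, w_lags=2, name_y="hoval", name_x=["inc", "crime"])
+    print(reg.summary)           # full results table
+    print(reg.betas.flatten())   # constant, inc, crime, spatial lag coefficient
+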
+.. autosummary::
+ :toctree: generated/
+
+ spreg.GM_Lag
+ spreg.ML_Lag
+ spreg.GMM_Error
+ spreg.ML_Error
+ spreg.GM_Error
+ spreg.GM_Error_Het
+ spreg.GM_Error_Hom
+ spreg.GM_Combo
+ spreg.GM_Combo_Het
+ spreg.GM_Combo_Hom
+ spreg.GM_Endog_Error
+ spreg.GM_Endog_Error_Het
+ spreg.GM_Endog_Error_Hom
+ spreg.NSLX
+
+Discrete Choice Models
+----------------------
+
+.. autosummary::
+ :toctree: generated/
+
+ spreg.Probit
+
+Regimes Models
+---------------
+
+Regimes models are variants of spatial regression models which allow for structural instability in parameters. That means that these models allow different coefficient values in distinct subsets of the data.
+
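+The sketch below illustrates the idea with ``GM_Lag_Regimes`` on the NCOVR
+sample data, using the SOUTH dummy to define two regimes; it condenses the
+example from the class docstring and assumes the libpysal example datasets
+are available.
+
+.. code-block:: python
+
+    import numpy as np
+    import libpysal
+    from libpysal import examples, weights
+    from spreg import GM_Lag_Regimes
+
+    db = libpysal.io.open(examples.get_path("NAT.dbf"), "r")
+    y = np.array([db.by_col("HR90")]).reshape(3085, 1)          # homicide rate, nx1
+    x = np.array([db.by_col(v) for v in ["PS90", "UE90"]]).T    # exogenous variables
+    regimes = db.by_col("SOUTH")                                # regime indicator per observation
+
+    w = weights.Rook.from_shapefile(examples.get_path("NAT.shp"))
+    w.transform = "r"
+
+    # one set of coefficients per regime, with a common spatial lag parameter
+    model = GM_Lag_Regimes(y, x, regimes, w=w, regime_lag_sep=False,
+                           regime_err_sep=False, name_y="HR90",
+                           name_x=["PS90", "UE90"], name_regimes="SOUTH")
+    print(model.summary)
+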
+.. autosummary::
+ :toctree: generated/
+
+ spreg.OLS_Regimes
+ spreg.TSLS_Regimes
+ spreg.ML_Lag_Regimes
+ spreg.ML_Error_Regimes
+ spreg.GM_Lag_Regimes
+ spreg.GM_Error_Regimes
+ spreg.GM_Error_Het_Regimes
+ spreg.GM_Error_Hom_Regimes
+ spreg.GM_Combo_Regimes
+ spreg.GM_Combo_Hom_Regimes
+ spreg.GM_Combo_Het_Regimes
+ spreg.GM_Endog_Error_Regimes
+ spreg.GM_Endog_Error_Hom_Regimes
+ spreg.GM_Endog_Error_Het_Regimes
+ spreg.OLS_Endog_Regimes
+ spreg.GM_Lag_Endog_Regimes
+ spreg.Skater_reg
+
+Seemingly-Unrelated Regressions
+--------------------------------
+
+Seemingly-unrelated regression models are a generalization of linear regression. These models (and their spatial generalizations) allow for correlation in the residual terms between groups that use the same model. In spatial Seemingly-Unrelated Regressions, the error terms across groups are allowed to exhibit a structured type of correlation: spatial correlation.
+
+.. autosummary::
+ :toctree: generated/
+
+ spreg.SUR
+ spreg.SURerrorGM
+ spreg.SURerrorML
+ spreg.SURlagIV
+ spreg.ThreeSLS
+
+Spatial Panel Models
+--------------------
+
+Spatial panel models allow for evaluating correlation in both spatial and time dimensions.
+
+.. autosummary::
+ :toctree: generated/
+
+ spreg.Panel_FE_Lag
+ spreg.Panel_FE_Error
+ spreg.Panel_RE_Lag
+ spreg.Panel_RE_Error
+ spreg.GM_KKP
+
+Diagnostics
+-----------
+
+Diagnostic tests are useful for identifying model fit, sufficiency, and specification correctness.
+
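+Most of the diagnostics are free functions that operate on a fitted
+regression object. A brief sketch, assuming each diagnostic takes the fitted
+model as its only required argument (as ``se_betas`` does in the model
+docstrings above):
+
+.. code-block:: python
+
+    import numpy as np
+    import libpysal
+    import spreg
+
+    # fit a simple OLS on the Columbus example data
+    db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r")
+    y = np.array(db.by_col("HOVAL")).reshape(-1, 1)
+    X = np.array([db.by_col("INC"), db.by_col("CRIME")]).T
+    ols = spreg.OLS(y, X, name_y="hoval", name_x=["inc", "crime"])
+
+    # diagnostics are called on the fitted regression object
+    print(spreg.se_betas(ols))      # standard errors of the estimates
+    print(spreg.r2(ols))            # coefficient of determination
+    print(spreg.jarque_bera(ols))   # normality test on the residuals
+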
+.. autosummary::
+ :toctree: generated/
+
+ spreg.f_stat
+ spreg.t_stat
+ spreg.r2
+ spreg.ar2
+ spreg.se_betas
+ spreg.log_likelihood
+ spreg.akaike
+ spreg.schwarz
+ spreg.condition_index
+ spreg.dwh
+ spreg.jarque_bera
+ spreg.breusch_pagan
+ spreg.white
+ spreg.koenker_bassett
+ spreg.vif
+ spreg.likratiotest
+ spreg.LMtests
+ spreg.MoranRes
+ spreg.AKtest
+ spreg.sur_setp
+ spreg.sur_lrtest
+ spreg.sur_lmtest
+ spreg.lam_setp
+ spreg.surLMe
+ spreg.surLMlag
+ spreg.constant_check
+ spreg.panel_LMlag
+ spreg.panel_LMerror
+ spreg.panel_rLMlag
+ spreg.panel_rLMerror
+ spreg.panel_Hausman
+
+
+Spatial Specification Search
+--------------------------------
+
+The `spsearch` module contains tools for conducting incremental specification searches for spatial econometric models following the approach of :cite:p:`anselin2024SpatialEconometric`.
+
+.. autosummary::
+ :toctree: generated/
+
+ spreg.spsearch.stge_classic
+ spreg.spsearch.stge_kb
+ spreg.spsearch.stge_pre
+ spreg.spsearch.gets_gns
+ spreg.spsearch.gets_sdm
+
+
+DGP
+-----------
+
+Tools for simulating synthetic data according to data-generating processes implied by different spatial model specifications.
+
+.. autosummary::
+ :toctree: generated/
+
+ spreg.dgp.make_error
+ spreg.dgp.make_x
+ spreg.dgp.make_wx
+ spreg.dgp.make_xb
+ spreg.dgp.make_wxg
+ spreg.dgp.dgp_errproc
+ spreg.dgp.dgp_ols
+ spreg.dgp.dgp_slx
+ spreg.dgp.dgp_sperror
+ spreg.dgp.dgp_slxerror
+ spreg.dgp.dgp_lag
+ spreg.dgp.dgp_spdurbin
+ spreg.dgp.dgp_lagerr
+ spreg.dgp.dgp_gns
+ spreg.dgp.dgp_mess
+ spreg.dgp.dgp_probit
+ spreg.dgp.make_bin
+ spreg.dgp.make_heterror
+ spreg.dgp.make_vmult
\ No newline at end of file
diff --git a/_sources/generated/spreg.AKtest.rst.txt b/_sources/generated/spreg.AKtest.rst.txt
new file mode 100644
index 00000000..c88779f4
--- /dev/null
+++ b/_sources/generated/spreg.AKtest.rst.txt
@@ -0,0 +1,22 @@
+spreg.AKtest
+============
+
+.. currentmodule:: spreg
+
+.. autoclass:: AKtest
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~AKtest.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GMM_Error.rst.txt b/_sources/generated/spreg.GMM_Error.rst.txt
new file mode 100644
index 00000000..1d860297
--- /dev/null
+++ b/_sources/generated/spreg.GMM_Error.rst.txt
@@ -0,0 +1,29 @@
+spreg.GMM\_Error
+================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GMM_Error
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GMM_Error.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GMM_Error.mean_y
+ ~GMM_Error.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Combo.rst.txt b/_sources/generated/spreg.GM_Combo.rst.txt
new file mode 100644
index 00000000..f13b9a47
--- /dev/null
+++ b/_sources/generated/spreg.GM_Combo.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Combo
+===============
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Combo
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Combo.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Combo.mean_y
+ ~GM_Combo.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Combo_Het.rst.txt b/_sources/generated/spreg.GM_Combo_Het.rst.txt
new file mode 100644
index 00000000..48bf0149
--- /dev/null
+++ b/_sources/generated/spreg.GM_Combo_Het.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Combo\_Het
+====================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Combo_Het
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Combo_Het.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Combo_Het.mean_y
+ ~GM_Combo_Het.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Combo_Het_Regimes.rst.txt b/_sources/generated/spreg.GM_Combo_Het_Regimes.rst.txt
new file mode 100644
index 00000000..574157d7
--- /dev/null
+++ b/_sources/generated/spreg.GM_Combo_Het_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Combo\_Het\_Regimes
+=============================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Combo_Het_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Combo_Het_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Combo_Het_Regimes.mean_y
+ ~GM_Combo_Het_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Combo_Hom.rst.txt b/_sources/generated/spreg.GM_Combo_Hom.rst.txt
new file mode 100644
index 00000000..64e73a55
--- /dev/null
+++ b/_sources/generated/spreg.GM_Combo_Hom.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Combo\_Hom
+====================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Combo_Hom
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Combo_Hom.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Combo_Hom.mean_y
+ ~GM_Combo_Hom.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Combo_Hom_Regimes.rst.txt b/_sources/generated/spreg.GM_Combo_Hom_Regimes.rst.txt
new file mode 100644
index 00000000..243e2d0a
--- /dev/null
+++ b/_sources/generated/spreg.GM_Combo_Hom_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Combo\_Hom\_Regimes
+=============================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Combo_Hom_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Combo_Hom_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Combo_Hom_Regimes.mean_y
+ ~GM_Combo_Hom_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Combo_Regimes.rst.txt b/_sources/generated/spreg.GM_Combo_Regimes.rst.txt
new file mode 100644
index 00000000..1104d6cd
--- /dev/null
+++ b/_sources/generated/spreg.GM_Combo_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Combo\_Regimes
+========================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Combo_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Combo_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Combo_Regimes.mean_y
+ ~GM_Combo_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Endog_Error.rst.txt b/_sources/generated/spreg.GM_Endog_Error.rst.txt
new file mode 100644
index 00000000..b67bcf4b
--- /dev/null
+++ b/_sources/generated/spreg.GM_Endog_Error.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Endog\_Error
+======================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Endog_Error
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Endog_Error.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Endog_Error.mean_y
+ ~GM_Endog_Error.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Endog_Error_Het.rst.txt b/_sources/generated/spreg.GM_Endog_Error_Het.rst.txt
new file mode 100644
index 00000000..dad1f183
--- /dev/null
+++ b/_sources/generated/spreg.GM_Endog_Error_Het.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Endog\_Error\_Het
+===========================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Endog_Error_Het
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Het.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Het.mean_y
+ ~GM_Endog_Error_Het.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Endog_Error_Het_Regimes.rst.txt b/_sources/generated/spreg.GM_Endog_Error_Het_Regimes.rst.txt
new file mode 100644
index 00000000..182dfdf7
--- /dev/null
+++ b/_sources/generated/spreg.GM_Endog_Error_Het_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Endog\_Error\_Het\_Regimes
+====================================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Endog_Error_Het_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Het_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Het_Regimes.mean_y
+ ~GM_Endog_Error_Het_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Endog_Error_Hom.rst.txt b/_sources/generated/spreg.GM_Endog_Error_Hom.rst.txt
new file mode 100644
index 00000000..13b66621
--- /dev/null
+++ b/_sources/generated/spreg.GM_Endog_Error_Hom.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Endog\_Error\_Hom
+===========================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Endog_Error_Hom
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Hom.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Hom.mean_y
+ ~GM_Endog_Error_Hom.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Endog_Error_Hom_Regimes.rst.txt b/_sources/generated/spreg.GM_Endog_Error_Hom_Regimes.rst.txt
new file mode 100644
index 00000000..3ebc774d
--- /dev/null
+++ b/_sources/generated/spreg.GM_Endog_Error_Hom_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Endog\_Error\_Hom\_Regimes
+====================================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Endog_Error_Hom_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Hom_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Hom_Regimes.mean_y
+ ~GM_Endog_Error_Hom_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Endog_Error_Regimes.rst.txt b/_sources/generated/spreg.GM_Endog_Error_Regimes.rst.txt
new file mode 100644
index 00000000..ee89a29e
--- /dev/null
+++ b/_sources/generated/spreg.GM_Endog_Error_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Endog\_Error\_Regimes
+===============================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Endog_Error_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Endog_Error_Regimes.mean_y
+ ~GM_Endog_Error_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Error.rst.txt b/_sources/generated/spreg.GM_Error.rst.txt
new file mode 100644
index 00000000..2ece4eca
--- /dev/null
+++ b/_sources/generated/spreg.GM_Error.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Error
+===============
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Error
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Error.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Error.mean_y
+ ~GM_Error.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Error_Het.rst.txt b/_sources/generated/spreg.GM_Error_Het.rst.txt
new file mode 100644
index 00000000..9cc1360e
--- /dev/null
+++ b/_sources/generated/spreg.GM_Error_Het.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Error\_Het
+====================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Error_Het
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Error_Het.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Error_Het.mean_y
+ ~GM_Error_Het.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Error_Het_Regimes.rst.txt b/_sources/generated/spreg.GM_Error_Het_Regimes.rst.txt
new file mode 100644
index 00000000..6a3fafcb
--- /dev/null
+++ b/_sources/generated/spreg.GM_Error_Het_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Error\_Het\_Regimes
+=============================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Error_Het_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Error_Het_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Error_Het_Regimes.mean_y
+ ~GM_Error_Het_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Error_Hom.rst.txt b/_sources/generated/spreg.GM_Error_Hom.rst.txt
new file mode 100644
index 00000000..1a47ae25
--- /dev/null
+++ b/_sources/generated/spreg.GM_Error_Hom.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Error\_Hom
+====================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Error_Hom
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Error_Hom.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Error_Hom.mean_y
+ ~GM_Error_Hom.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Error_Hom_Regimes.rst.txt b/_sources/generated/spreg.GM_Error_Hom_Regimes.rst.txt
new file mode 100644
index 00000000..1c3a9c30
--- /dev/null
+++ b/_sources/generated/spreg.GM_Error_Hom_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Error\_Hom\_Regimes
+=============================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Error_Hom_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Error_Hom_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Error_Hom_Regimes.mean_y
+ ~GM_Error_Hom_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Error_Regimes.rst.txt b/_sources/generated/spreg.GM_Error_Regimes.rst.txt
new file mode 100644
index 00000000..ba5a00cc
--- /dev/null
+++ b/_sources/generated/spreg.GM_Error_Regimes.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_Error\_Regimes
+========================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Error_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Error_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Error_Regimes.mean_y
+ ~GM_Error_Regimes.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_KKP.rst.txt b/_sources/generated/spreg.GM_KKP.rst.txt
new file mode 100644
index 00000000..abeb8f31
--- /dev/null
+++ b/_sources/generated/spreg.GM_KKP.rst.txt
@@ -0,0 +1,29 @@
+spreg.GM\_KKP
+=============
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_KKP
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_KKP.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_KKP.mean_y
+ ~GM_KKP.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Lag.rst.txt b/_sources/generated/spreg.GM_Lag.rst.txt
new file mode 100644
index 00000000..cb8f7bbb
--- /dev/null
+++ b/_sources/generated/spreg.GM_Lag.rst.txt
@@ -0,0 +1,34 @@
+spreg.GM\_Lag
+=============
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Lag
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Lag.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Lag.mean_y
+ ~GM_Lag.pfora1a2
+ ~GM_Lag.sig2n
+ ~GM_Lag.sig2n_k
+ ~GM_Lag.std_y
+ ~GM_Lag.utu
+ ~GM_Lag.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Lag_Endog_Regimes.rst.txt b/_sources/generated/spreg.GM_Lag_Endog_Regimes.rst.txt
new file mode 100644
index 00000000..dfc7f5d6
--- /dev/null
+++ b/_sources/generated/spreg.GM_Lag_Endog_Regimes.rst.txt
@@ -0,0 +1,36 @@
+spreg.GM\_Lag\_Endog\_Regimes
+=============================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Lag_Endog_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Lag_Endog_Regimes.GM_Lag_Regimes_Multi
+ ~GM_Lag_Endog_Regimes.__init__
+ ~GM_Lag_Endog_Regimes.sp_att_reg
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Lag_Endog_Regimes.mean_y
+ ~GM_Lag_Endog_Regimes.pfora1a2
+ ~GM_Lag_Endog_Regimes.sig2n
+ ~GM_Lag_Endog_Regimes.sig2n_k
+ ~GM_Lag_Endog_Regimes.std_y
+ ~GM_Lag_Endog_Regimes.utu
+ ~GM_Lag_Endog_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.GM_Lag_Regimes.rst.txt b/_sources/generated/spreg.GM_Lag_Regimes.rst.txt
new file mode 100644
index 00000000..964390e4
--- /dev/null
+++ b/_sources/generated/spreg.GM_Lag_Regimes.rst.txt
@@ -0,0 +1,36 @@
+spreg.GM\_Lag\_Regimes
+======================
+
+.. currentmodule:: spreg
+
+.. autoclass:: GM_Lag_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~GM_Lag_Regimes.GM_Lag_Regimes_Multi
+ ~GM_Lag_Regimes.__init__
+ ~GM_Lag_Regimes.sp_att_reg
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~GM_Lag_Regimes.mean_y
+ ~GM_Lag_Regimes.pfora1a2
+ ~GM_Lag_Regimes.sig2n
+ ~GM_Lag_Regimes.sig2n_k
+ ~GM_Lag_Regimes.std_y
+ ~GM_Lag_Regimes.utu
+ ~GM_Lag_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.LMtests.rst.txt b/_sources/generated/spreg.LMtests.rst.txt
new file mode 100644
index 00000000..8e52c18f
--- /dev/null
+++ b/_sources/generated/spreg.LMtests.rst.txt
@@ -0,0 +1,22 @@
+spreg.LMtests
+=============
+
+.. currentmodule:: spreg
+
+.. autoclass:: LMtests
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~LMtests.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.ML_Error.rst.txt b/_sources/generated/spreg.ML_Error.rst.txt
new file mode 100644
index 00000000..94ba51ec
--- /dev/null
+++ b/_sources/generated/spreg.ML_Error.rst.txt
@@ -0,0 +1,34 @@
+spreg.ML\_Error
+===============
+
+.. currentmodule:: spreg
+
+.. autoclass:: ML_Error
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~ML_Error.__init__
+ ~ML_Error.get_x_lag
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~ML_Error.mean_y
+ ~ML_Error.sig2n
+ ~ML_Error.sig2n_k
+ ~ML_Error.std_y
+ ~ML_Error.utu
+ ~ML_Error.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.ML_Error_Regimes.rst.txt b/_sources/generated/spreg.ML_Error_Regimes.rst.txt
new file mode 100644
index 00000000..65442bfa
--- /dev/null
+++ b/_sources/generated/spreg.ML_Error_Regimes.rst.txt
@@ -0,0 +1,34 @@
+spreg.ML\_Error\_Regimes
+========================
+
+.. currentmodule:: spreg
+
+.. autoclass:: ML_Error_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~ML_Error_Regimes.__init__
+ ~ML_Error_Regimes.get_x_lag
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~ML_Error_Regimes.mean_y
+ ~ML_Error_Regimes.sig2n
+ ~ML_Error_Regimes.sig2n_k
+ ~ML_Error_Regimes.std_y
+ ~ML_Error_Regimes.utu
+ ~ML_Error_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.ML_Lag.rst.txt b/_sources/generated/spreg.ML_Lag.rst.txt
new file mode 100644
index 00000000..c862afd9
--- /dev/null
+++ b/_sources/generated/spreg.ML_Lag.rst.txt
@@ -0,0 +1,33 @@
+spreg.ML\_Lag
+=============
+
+.. currentmodule:: spreg
+
+.. autoclass:: ML_Lag
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~ML_Lag.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~ML_Lag.mean_y
+ ~ML_Lag.sig2n
+ ~ML_Lag.sig2n_k
+ ~ML_Lag.std_y
+ ~ML_Lag.utu
+ ~ML_Lag.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.ML_Lag_Regimes.rst.txt b/_sources/generated/spreg.ML_Lag_Regimes.rst.txt
new file mode 100644
index 00000000..c025abe1
--- /dev/null
+++ b/_sources/generated/spreg.ML_Lag_Regimes.rst.txt
@@ -0,0 +1,34 @@
+spreg.ML\_Lag\_Regimes
+======================
+
+.. currentmodule:: spreg
+
+.. autoclass:: ML_Lag_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~ML_Lag_Regimes.ML_Lag_Regimes_Multi
+ ~ML_Lag_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~ML_Lag_Regimes.mean_y
+ ~ML_Lag_Regimes.sig2n
+ ~ML_Lag_Regimes.sig2n_k
+ ~ML_Lag_Regimes.std_y
+ ~ML_Lag_Regimes.utu
+ ~ML_Lag_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.MoranRes.rst.txt b/_sources/generated/spreg.MoranRes.rst.txt
new file mode 100644
index 00000000..ea1c616e
--- /dev/null
+++ b/_sources/generated/spreg.MoranRes.rst.txt
@@ -0,0 +1,22 @@
+spreg.MoranRes
+==============
+
+.. currentmodule:: spreg
+
+.. autoclass:: MoranRes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~MoranRes.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.NSLX.rst.txt b/_sources/generated/spreg.NSLX.rst.txt
new file mode 100644
index 00000000..b9af58d1
--- /dev/null
+++ b/_sources/generated/spreg.NSLX.rst.txt
@@ -0,0 +1,29 @@
+spreg.NSLX
+==========
+
+.. currentmodule:: spreg
+
+.. autoclass:: NSLX
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~NSLX.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~NSLX.mean_y
+ ~NSLX.std_y
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.OLS.rst.txt b/_sources/generated/spreg.OLS.rst.txt
new file mode 100644
index 00000000..67fd5ef5
--- /dev/null
+++ b/_sources/generated/spreg.OLS.rst.txt
@@ -0,0 +1,33 @@
+spreg.OLS
+=========
+
+.. currentmodule:: spreg
+
+.. autoclass:: OLS
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~OLS.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~OLS.mean_y
+ ~OLS.sig2n
+ ~OLS.sig2n_k
+ ~OLS.std_y
+ ~OLS.utu
+ ~OLS.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.OLS_Endog_Regimes.rst.txt b/_sources/generated/spreg.OLS_Endog_Regimes.rst.txt
new file mode 100644
index 00000000..e8823945
--- /dev/null
+++ b/_sources/generated/spreg.OLS_Endog_Regimes.rst.txt
@@ -0,0 +1,33 @@
+spreg.OLS\_Endog\_Regimes
+=========================
+
+.. currentmodule:: spreg
+
+.. autoclass:: OLS_Endog_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~OLS_Endog_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~OLS_Endog_Regimes.mean_y
+ ~OLS_Endog_Regimes.sig2n
+ ~OLS_Endog_Regimes.sig2n_k
+ ~OLS_Endog_Regimes.std_y
+ ~OLS_Endog_Regimes.utu
+ ~OLS_Endog_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.OLS_Regimes.rst.txt b/_sources/generated/spreg.OLS_Regimes.rst.txt
new file mode 100644
index 00000000..83a71f2e
--- /dev/null
+++ b/_sources/generated/spreg.OLS_Regimes.rst.txt
@@ -0,0 +1,33 @@
+spreg.OLS\_Regimes
+==================
+
+.. currentmodule:: spreg
+
+.. autoclass:: OLS_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~OLS_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~OLS_Regimes.mean_y
+ ~OLS_Regimes.sig2n
+ ~OLS_Regimes.sig2n_k
+ ~OLS_Regimes.std_y
+ ~OLS_Regimes.utu
+ ~OLS_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.Panel_FE_Error.rst.txt b/_sources/generated/spreg.Panel_FE_Error.rst.txt
new file mode 100644
index 00000000..a0d8ff66
--- /dev/null
+++ b/_sources/generated/spreg.Panel_FE_Error.rst.txt
@@ -0,0 +1,33 @@
+spreg.Panel\_FE\_Error
+======================
+
+.. currentmodule:: spreg
+
+.. autoclass:: Panel_FE_Error
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Panel_FE_Error.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~Panel_FE_Error.mean_y
+ ~Panel_FE_Error.sig2n
+ ~Panel_FE_Error.sig2n_k
+ ~Panel_FE_Error.std_y
+ ~Panel_FE_Error.utu
+ ~Panel_FE_Error.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.Panel_FE_Lag.rst.txt b/_sources/generated/spreg.Panel_FE_Lag.rst.txt
new file mode 100644
index 00000000..b7e82711
--- /dev/null
+++ b/_sources/generated/spreg.Panel_FE_Lag.rst.txt
@@ -0,0 +1,33 @@
+spreg.Panel\_FE\_Lag
+====================
+
+.. currentmodule:: spreg
+
+.. autoclass:: Panel_FE_Lag
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Panel_FE_Lag.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~Panel_FE_Lag.mean_y
+ ~Panel_FE_Lag.sig2n
+ ~Panel_FE_Lag.sig2n_k
+ ~Panel_FE_Lag.std_y
+ ~Panel_FE_Lag.utu
+ ~Panel_FE_Lag.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.Panel_RE_Error.rst.txt b/_sources/generated/spreg.Panel_RE_Error.rst.txt
new file mode 100644
index 00000000..4cf0cf96
--- /dev/null
+++ b/_sources/generated/spreg.Panel_RE_Error.rst.txt
@@ -0,0 +1,33 @@
+spreg.Panel\_RE\_Error
+======================
+
+.. currentmodule:: spreg
+
+.. autoclass:: Panel_RE_Error
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Panel_RE_Error.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~Panel_RE_Error.mean_y
+ ~Panel_RE_Error.sig2n
+ ~Panel_RE_Error.sig2n_k
+ ~Panel_RE_Error.std_y
+ ~Panel_RE_Error.utu
+ ~Panel_RE_Error.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.Panel_RE_Lag.rst.txt b/_sources/generated/spreg.Panel_RE_Lag.rst.txt
new file mode 100644
index 00000000..4fd33117
--- /dev/null
+++ b/_sources/generated/spreg.Panel_RE_Lag.rst.txt
@@ -0,0 +1,33 @@
+spreg.Panel\_RE\_Lag
+====================
+
+.. currentmodule:: spreg
+
+.. autoclass:: Panel_RE_Lag
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Panel_RE_Lag.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~Panel_RE_Lag.mean_y
+ ~Panel_RE_Lag.sig2n
+ ~Panel_RE_Lag.sig2n_k
+ ~Panel_RE_Lag.std_y
+ ~Panel_RE_Lag.utu
+ ~Panel_RE_Lag.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.Probit.rst.txt b/_sources/generated/spreg.Probit.rst.txt
new file mode 100644
index 00000000..29cc06ed
--- /dev/null
+++ b/_sources/generated/spreg.Probit.rst.txt
@@ -0,0 +1,49 @@
+spreg.Probit
+============
+
+.. currentmodule:: spreg
+
+.. autoclass:: Probit
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Probit.__init__
+ ~Probit.gradient
+ ~Probit.hessian
+ ~Probit.ll
+ ~Probit.par_est
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~Probit.KP_error
+ ~Probit.LR
+ ~Probit.PS_error
+ ~Probit.Pinkse_error
+ ~Probit.phiy
+ ~Probit.predpc
+ ~Probit.predy
+ ~Probit.scale
+ ~Probit.slopes
+ ~Probit.slopes_std_err
+ ~Probit.slopes_vm
+ ~Probit.slopes_z_stat
+ ~Probit.u_gen
+ ~Probit.u_naive
+ ~Probit.vm
+ ~Probit.xb
+ ~Probit.xmean
+ ~Probit.z_stat
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.SUR.rst.txt b/_sources/generated/spreg.SUR.rst.txt
new file mode 100644
index 00000000..4748b11f
--- /dev/null
+++ b/_sources/generated/spreg.SUR.rst.txt
@@ -0,0 +1,22 @@
+spreg.SUR
+=========
+
+.. currentmodule:: spreg
+
+.. autoclass:: SUR
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~SUR.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.SURerrorGM.rst.txt b/_sources/generated/spreg.SURerrorGM.rst.txt
new file mode 100644
index 00000000..f9040480
--- /dev/null
+++ b/_sources/generated/spreg.SURerrorGM.rst.txt
@@ -0,0 +1,22 @@
+spreg.SURerrorGM
+================
+
+.. currentmodule:: spreg
+
+.. autoclass:: SURerrorGM
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~SURerrorGM.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.SURerrorML.rst.txt b/_sources/generated/spreg.SURerrorML.rst.txt
new file mode 100644
index 00000000..66b8dc33
--- /dev/null
+++ b/_sources/generated/spreg.SURerrorML.rst.txt
@@ -0,0 +1,22 @@
+spreg.SURerrorML
+================
+
+.. currentmodule:: spreg
+
+.. autoclass:: SURerrorML
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~SURerrorML.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.SURlagIV.rst.txt b/_sources/generated/spreg.SURlagIV.rst.txt
new file mode 100644
index 00000000..808666c6
--- /dev/null
+++ b/_sources/generated/spreg.SURlagIV.rst.txt
@@ -0,0 +1,22 @@
+spreg.SURlagIV
+==============
+
+.. currentmodule:: spreg
+
+.. autoclass:: SURlagIV
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~SURlagIV.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.Skater_reg.rst.txt b/_sources/generated/spreg.Skater_reg.rst.txt
new file mode 100644
index 00000000..20ba1961
--- /dev/null
+++ b/_sources/generated/spreg.Skater_reg.rst.txt
@@ -0,0 +1,26 @@
+spreg.Skater\_reg
+=================
+
+.. currentmodule:: spreg
+
+.. autoclass:: Skater_reg
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~Skater_reg.__init__
+ ~Skater_reg.find_cut
+ ~Skater_reg.fit
+ ~Skater_reg.score_spreg
+ ~Skater_reg.score_stats
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.TSLS.rst.txt b/_sources/generated/spreg.TSLS.rst.txt
new file mode 100644
index 00000000..695330ff
--- /dev/null
+++ b/_sources/generated/spreg.TSLS.rst.txt
@@ -0,0 +1,34 @@
+spreg.TSLS
+==========
+
+.. currentmodule:: spreg
+
+.. autoclass:: TSLS
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~TSLS.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~TSLS.mean_y
+ ~TSLS.pfora1a2
+ ~TSLS.sig2n
+ ~TSLS.sig2n_k
+ ~TSLS.std_y
+ ~TSLS.utu
+ ~TSLS.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.TSLS_Regimes.rst.txt b/_sources/generated/spreg.TSLS_Regimes.rst.txt
new file mode 100644
index 00000000..f6c01d86
--- /dev/null
+++ b/_sources/generated/spreg.TSLS_Regimes.rst.txt
@@ -0,0 +1,34 @@
+spreg.TSLS\_Regimes
+===================
+
+.. currentmodule:: spreg
+
+.. autoclass:: TSLS_Regimes
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~TSLS_Regimes.__init__
+
+
+
+
+
+ .. rubric:: Attributes
+
+ .. autosummary::
+
+ ~TSLS_Regimes.mean_y
+ ~TSLS_Regimes.pfora1a2
+ ~TSLS_Regimes.sig2n
+ ~TSLS_Regimes.sig2n_k
+ ~TSLS_Regimes.std_y
+ ~TSLS_Regimes.utu
+ ~TSLS_Regimes.vm
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.ThreeSLS.rst.txt b/_sources/generated/spreg.ThreeSLS.rst.txt
new file mode 100644
index 00000000..9ccd6065
--- /dev/null
+++ b/_sources/generated/spreg.ThreeSLS.rst.txt
@@ -0,0 +1,22 @@
+spreg.ThreeSLS
+==============
+
+.. currentmodule:: spreg
+
+.. autoclass:: ThreeSLS
+
+
+ .. automethod:: __init__
+
+
+ .. rubric:: Methods
+
+ .. autosummary::
+
+ ~ThreeSLS.__init__
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_sources/generated/spreg.akaike.rst.txt b/_sources/generated/spreg.akaike.rst.txt
new file mode 100644
index 00000000..721011c1
--- /dev/null
+++ b/_sources/generated/spreg.akaike.rst.txt
@@ -0,0 +1,6 @@
+spreg.akaike
+============
+
+.. currentmodule:: spreg
+
+.. autofunction:: akaike
\ No newline at end of file
diff --git a/_sources/generated/spreg.ar2.rst.txt b/_sources/generated/spreg.ar2.rst.txt
new file mode 100644
index 00000000..13b026bb
--- /dev/null
+++ b/_sources/generated/spreg.ar2.rst.txt
@@ -0,0 +1,6 @@
+spreg.ar2
+=========
+
+.. currentmodule:: spreg
+
+.. autofunction:: ar2
\ No newline at end of file
diff --git a/_sources/generated/spreg.breusch_pagan.rst.txt b/_sources/generated/spreg.breusch_pagan.rst.txt
new file mode 100644
index 00000000..e04ada15
--- /dev/null
+++ b/_sources/generated/spreg.breusch_pagan.rst.txt
@@ -0,0 +1,6 @@
+spreg.breusch\_pagan
+====================
+
+.. currentmodule:: spreg
+
+.. autofunction:: breusch_pagan
\ No newline at end of file
diff --git a/_sources/generated/spreg.condition_index.rst.txt b/_sources/generated/spreg.condition_index.rst.txt
new file mode 100644
index 00000000..0c98e51b
--- /dev/null
+++ b/_sources/generated/spreg.condition_index.rst.txt
@@ -0,0 +1,6 @@
+spreg.condition\_index
+======================
+
+.. currentmodule:: spreg
+
+.. autofunction:: condition_index
\ No newline at end of file
diff --git a/_sources/generated/spreg.constant_check.rst.txt b/_sources/generated/spreg.constant_check.rst.txt
new file mode 100644
index 00000000..ad4fcdbf
--- /dev/null
+++ b/_sources/generated/spreg.constant_check.rst.txt
@@ -0,0 +1,6 @@
+spreg.constant\_check
+=====================
+
+.. currentmodule:: spreg
+
+.. autofunction:: constant_check
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_errproc.rst.txt b/_sources/generated/spreg.dgp.dgp_errproc.rst.txt
new file mode 100644
index 00000000..918e089b
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_errproc.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_errproc
+======================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_errproc
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_gns.rst.txt b/_sources/generated/spreg.dgp.dgp_gns.rst.txt
new file mode 100644
index 00000000..8eaf9fa0
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_gns.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_gns
+==================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_gns
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_lag.rst.txt b/_sources/generated/spreg.dgp.dgp_lag.rst.txt
new file mode 100644
index 00000000..e3086298
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_lag.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_lag
+==================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_lag
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_lagerr.rst.txt b/_sources/generated/spreg.dgp.dgp_lagerr.rst.txt
new file mode 100644
index 00000000..dfb2c3e2
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_lagerr.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_lagerr
+=====================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_lagerr
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_mess.rst.txt b/_sources/generated/spreg.dgp.dgp_mess.rst.txt
new file mode 100644
index 00000000..f099306f
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_mess.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_mess
+===================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_mess
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_ols.rst.txt b/_sources/generated/spreg.dgp.dgp_ols.rst.txt
new file mode 100644
index 00000000..38e0e854
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_ols.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_ols
+==================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_ols
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_probit.rst.txt b/_sources/generated/spreg.dgp.dgp_probit.rst.txt
new file mode 100644
index 00000000..64b0f955
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_probit.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_probit
+=====================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_probit
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_slx.rst.txt b/_sources/generated/spreg.dgp.dgp_slx.rst.txt
new file mode 100644
index 00000000..c2ac9ffc
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_slx.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_slx
+==================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_slx
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_slxerror.rst.txt b/_sources/generated/spreg.dgp.dgp_slxerror.rst.txt
new file mode 100644
index 00000000..9ebecf24
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_slxerror.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_slxerror
+=======================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_slxerror
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_spdurbin.rst.txt b/_sources/generated/spreg.dgp.dgp_spdurbin.rst.txt
new file mode 100644
index 00000000..d9f0d0e6
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_spdurbin.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_spdurbin
+=======================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_spdurbin
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.dgp_sperror.rst.txt b/_sources/generated/spreg.dgp.dgp_sperror.rst.txt
new file mode 100644
index 00000000..6dc6788a
--- /dev/null
+++ b/_sources/generated/spreg.dgp.dgp_sperror.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.dgp\_sperror
+======================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: dgp_sperror
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_bin.rst.txt b/_sources/generated/spreg.dgp.make_bin.rst.txt
new file mode 100644
index 00000000..7771a9f7
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_bin.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_bin
+===================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_bin
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_error.rst.txt b/_sources/generated/spreg.dgp.make_error.rst.txt
new file mode 100644
index 00000000..6bbd6907
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_error.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_error
+=====================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_error
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_heterror.rst.txt b/_sources/generated/spreg.dgp.make_heterror.rst.txt
new file mode 100644
index 00000000..81ade980
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_heterror.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_heterror
+========================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_heterror
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_vmult.rst.txt b/_sources/generated/spreg.dgp.make_vmult.rst.txt
new file mode 100644
index 00000000..f1eef4c9
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_vmult.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_vmult
+=====================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_vmult
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_wx.rst.txt b/_sources/generated/spreg.dgp.make_wx.rst.txt
new file mode 100644
index 00000000..b727a10c
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_wx.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_wx
+==================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_wx
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_wxg.rst.txt b/_sources/generated/spreg.dgp.make_wxg.rst.txt
new file mode 100644
index 00000000..f547033d
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_wxg.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_wxg
+===================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_wxg
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_x.rst.txt b/_sources/generated/spreg.dgp.make_x.rst.txt
new file mode 100644
index 00000000..e309dc48
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_x.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_x
+=================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_x
\ No newline at end of file
diff --git a/_sources/generated/spreg.dgp.make_xb.rst.txt b/_sources/generated/spreg.dgp.make_xb.rst.txt
new file mode 100644
index 00000000..a4091e53
--- /dev/null
+++ b/_sources/generated/spreg.dgp.make_xb.rst.txt
@@ -0,0 +1,6 @@
+spreg.dgp.make\_xb
+==================
+
+.. currentmodule:: spreg.dgp
+
+.. autofunction:: make_xb
\ No newline at end of file
diff --git a/_sources/generated/spreg.dwh.rst.txt b/_sources/generated/spreg.dwh.rst.txt
new file mode 100644
index 00000000..035ed868
--- /dev/null
+++ b/_sources/generated/spreg.dwh.rst.txt
@@ -0,0 +1,6 @@
+spreg.dwh
+=========
+
+.. currentmodule:: spreg
+
+.. autofunction:: dwh
\ No newline at end of file
diff --git a/_sources/generated/spreg.f_stat.rst.txt b/_sources/generated/spreg.f_stat.rst.txt
new file mode 100644
index 00000000..5ed721ac
--- /dev/null
+++ b/_sources/generated/spreg.f_stat.rst.txt
@@ -0,0 +1,6 @@
+spreg.f\_stat
+=============
+
+.. currentmodule:: spreg
+
+.. autofunction:: f_stat
\ No newline at end of file
diff --git a/_sources/generated/spreg.jarque_bera.rst.txt b/_sources/generated/spreg.jarque_bera.rst.txt
new file mode 100644
index 00000000..9bca1990
--- /dev/null
+++ b/_sources/generated/spreg.jarque_bera.rst.txt
@@ -0,0 +1,6 @@
+spreg.jarque\_bera
+==================
+
+.. currentmodule:: spreg
+
+.. autofunction:: jarque_bera
\ No newline at end of file
diff --git a/_sources/generated/spreg.koenker_bassett.rst.txt b/_sources/generated/spreg.koenker_bassett.rst.txt
new file mode 100644
index 00000000..4b682575
--- /dev/null
+++ b/_sources/generated/spreg.koenker_bassett.rst.txt
@@ -0,0 +1,6 @@
+spreg.koenker\_bassett
+======================
+
+.. currentmodule:: spreg
+
+.. autofunction:: koenker_bassett
\ No newline at end of file
diff --git a/_sources/generated/spreg.lam_setp.rst.txt b/_sources/generated/spreg.lam_setp.rst.txt
new file mode 100644
index 00000000..6ae82f98
--- /dev/null
+++ b/_sources/generated/spreg.lam_setp.rst.txt
@@ -0,0 +1,6 @@
+spreg.lam\_setp
+===============
+
+.. currentmodule:: spreg
+
+.. autofunction:: lam_setp
\ No newline at end of file
diff --git a/_sources/generated/spreg.likratiotest.rst.txt b/_sources/generated/spreg.likratiotest.rst.txt
new file mode 100644
index 00000000..37937efd
--- /dev/null
+++ b/_sources/generated/spreg.likratiotest.rst.txt
@@ -0,0 +1,6 @@
+spreg.likratiotest
+==================
+
+.. currentmodule:: spreg
+
+.. autofunction:: likratiotest
\ No newline at end of file
diff --git a/_sources/generated/spreg.log_likelihood.rst.txt b/_sources/generated/spreg.log_likelihood.rst.txt
new file mode 100644
index 00000000..94eaedd5
--- /dev/null
+++ b/_sources/generated/spreg.log_likelihood.rst.txt
@@ -0,0 +1,6 @@
+spreg.log\_likelihood
+=====================
+
+.. currentmodule:: spreg
+
+.. autofunction:: log_likelihood
\ No newline at end of file
diff --git a/_sources/generated/spreg.panel_Hausman.rst.txt b/_sources/generated/spreg.panel_Hausman.rst.txt
new file mode 100644
index 00000000..a89f25a7
--- /dev/null
+++ b/_sources/generated/spreg.panel_Hausman.rst.txt
@@ -0,0 +1,6 @@
+spreg.panel\_Hausman
+====================
+
+.. currentmodule:: spreg
+
+.. autofunction:: panel_Hausman
\ No newline at end of file
diff --git a/_sources/generated/spreg.panel_LMerror.rst.txt b/_sources/generated/spreg.panel_LMerror.rst.txt
new file mode 100644
index 00000000..17ec1f8d
--- /dev/null
+++ b/_sources/generated/spreg.panel_LMerror.rst.txt
@@ -0,0 +1,6 @@
+spreg.panel\_LMerror
+====================
+
+.. currentmodule:: spreg
+
+.. autofunction:: panel_LMerror
\ No newline at end of file
diff --git a/_sources/generated/spreg.panel_LMlag.rst.txt b/_sources/generated/spreg.panel_LMlag.rst.txt
new file mode 100644
index 00000000..695c15bf
--- /dev/null
+++ b/_sources/generated/spreg.panel_LMlag.rst.txt
@@ -0,0 +1,6 @@
+spreg.panel\_LMlag
+==================
+
+.. currentmodule:: spreg
+
+.. autofunction:: panel_LMlag
\ No newline at end of file
diff --git a/_sources/generated/spreg.panel_rLMerror.rst.txt b/_sources/generated/spreg.panel_rLMerror.rst.txt
new file mode 100644
index 00000000..c851a31a
--- /dev/null
+++ b/_sources/generated/spreg.panel_rLMerror.rst.txt
@@ -0,0 +1,6 @@
+spreg.panel\_rLMerror
+=====================
+
+.. currentmodule:: spreg
+
+.. autofunction:: panel_rLMerror
\ No newline at end of file
diff --git a/_sources/generated/spreg.panel_rLMlag.rst.txt b/_sources/generated/spreg.panel_rLMlag.rst.txt
new file mode 100644
index 00000000..2a908232
--- /dev/null
+++ b/_sources/generated/spreg.panel_rLMlag.rst.txt
@@ -0,0 +1,6 @@
+spreg.panel\_rLMlag
+===================
+
+.. currentmodule:: spreg
+
+.. autofunction:: panel_rLMlag
\ No newline at end of file
diff --git a/_sources/generated/spreg.r2.rst.txt b/_sources/generated/spreg.r2.rst.txt
new file mode 100644
index 00000000..1c2a0f12
--- /dev/null
+++ b/_sources/generated/spreg.r2.rst.txt
@@ -0,0 +1,6 @@
+spreg.r2
+========
+
+.. currentmodule:: spreg
+
+.. autofunction:: r2
\ No newline at end of file
diff --git a/_sources/generated/spreg.schwarz.rst.txt b/_sources/generated/spreg.schwarz.rst.txt
new file mode 100644
index 00000000..c361c012
--- /dev/null
+++ b/_sources/generated/spreg.schwarz.rst.txt
@@ -0,0 +1,6 @@
+spreg.schwarz
+=============
+
+.. currentmodule:: spreg
+
+.. autofunction:: schwarz
\ No newline at end of file
diff --git a/_sources/generated/spreg.se_betas.rst.txt b/_sources/generated/spreg.se_betas.rst.txt
new file mode 100644
index 00000000..a1667a57
--- /dev/null
+++ b/_sources/generated/spreg.se_betas.rst.txt
@@ -0,0 +1,6 @@
+spreg.se\_betas
+===============
+
+.. currentmodule:: spreg
+
+.. autofunction:: se_betas
\ No newline at end of file
diff --git a/_sources/generated/spreg.spsearch.gets_gns.rst.txt b/_sources/generated/spreg.spsearch.gets_gns.rst.txt
new file mode 100644
index 00000000..39510852
--- /dev/null
+++ b/_sources/generated/spreg.spsearch.gets_gns.rst.txt
@@ -0,0 +1,6 @@
+spreg.spsearch.gets\_gns
+========================
+
+.. currentmodule:: spreg.spsearch
+
+.. autofunction:: gets_gns
\ No newline at end of file
diff --git a/_sources/generated/spreg.spsearch.gets_sdm.rst.txt b/_sources/generated/spreg.spsearch.gets_sdm.rst.txt
new file mode 100644
index 00000000..edd4b268
--- /dev/null
+++ b/_sources/generated/spreg.spsearch.gets_sdm.rst.txt
@@ -0,0 +1,6 @@
+spreg.spsearch.gets\_sdm
+========================
+
+.. currentmodule:: spreg.spsearch
+
+.. autofunction:: gets_sdm
\ No newline at end of file
diff --git a/_sources/generated/spreg.spsearch.stge_classic.rst.txt b/_sources/generated/spreg.spsearch.stge_classic.rst.txt
new file mode 100644
index 00000000..05cc5476
--- /dev/null
+++ b/_sources/generated/spreg.spsearch.stge_classic.rst.txt
@@ -0,0 +1,6 @@
+spreg.spsearch.stge\_classic
+============================
+
+.. currentmodule:: spreg.spsearch
+
+.. autofunction:: stge_classic
\ No newline at end of file
diff --git a/_sources/generated/spreg.spsearch.stge_kb.rst.txt b/_sources/generated/spreg.spsearch.stge_kb.rst.txt
new file mode 100644
index 00000000..82686fed
--- /dev/null
+++ b/_sources/generated/spreg.spsearch.stge_kb.rst.txt
@@ -0,0 +1,6 @@
+spreg.spsearch.stge\_kb
+=======================
+
+.. currentmodule:: spreg.spsearch
+
+.. autofunction:: stge_kb
\ No newline at end of file
diff --git a/_sources/generated/spreg.spsearch.stge_pre.rst.txt b/_sources/generated/spreg.spsearch.stge_pre.rst.txt
new file mode 100644
index 00000000..7a655cf7
--- /dev/null
+++ b/_sources/generated/spreg.spsearch.stge_pre.rst.txt
@@ -0,0 +1,6 @@
+spreg.spsearch.stge\_pre
+========================
+
+.. currentmodule:: spreg.spsearch
+
+.. autofunction:: stge_pre
\ No newline at end of file
diff --git a/_sources/generated/spreg.surLMe.rst.txt b/_sources/generated/spreg.surLMe.rst.txt
new file mode 100644
index 00000000..7cede27a
--- /dev/null
+++ b/_sources/generated/spreg.surLMe.rst.txt
@@ -0,0 +1,6 @@
+spreg.surLMe
+============
+
+.. currentmodule:: spreg
+
+.. autofunction:: surLMe
\ No newline at end of file
diff --git a/_sources/generated/spreg.surLMlag.rst.txt b/_sources/generated/spreg.surLMlag.rst.txt
new file mode 100644
index 00000000..48470bb7
--- /dev/null
+++ b/_sources/generated/spreg.surLMlag.rst.txt
@@ -0,0 +1,6 @@
+spreg.surLMlag
+==============
+
+.. currentmodule:: spreg
+
+.. autofunction:: surLMlag
\ No newline at end of file
diff --git a/_sources/generated/spreg.sur_lmtest.rst.txt b/_sources/generated/spreg.sur_lmtest.rst.txt
new file mode 100644
index 00000000..15aa811a
--- /dev/null
+++ b/_sources/generated/spreg.sur_lmtest.rst.txt
@@ -0,0 +1,6 @@
+spreg.sur\_lmtest
+=================
+
+.. currentmodule:: spreg
+
+.. autofunction:: sur_lmtest
\ No newline at end of file
diff --git a/_sources/generated/spreg.sur_lrtest.rst.txt b/_sources/generated/spreg.sur_lrtest.rst.txt
new file mode 100644
index 00000000..3956ee6c
--- /dev/null
+++ b/_sources/generated/spreg.sur_lrtest.rst.txt
@@ -0,0 +1,6 @@
+spreg.sur\_lrtest
+=================
+
+.. currentmodule:: spreg
+
+.. autofunction:: sur_lrtest
\ No newline at end of file
diff --git a/_sources/generated/spreg.sur_setp.rst.txt b/_sources/generated/spreg.sur_setp.rst.txt
new file mode 100644
index 00000000..0b55a797
--- /dev/null
+++ b/_sources/generated/spreg.sur_setp.rst.txt
@@ -0,0 +1,6 @@
+spreg.sur\_setp
+===============
+
+.. currentmodule:: spreg
+
+.. autofunction:: sur_setp
\ No newline at end of file
diff --git a/_sources/generated/spreg.t_stat.rst.txt b/_sources/generated/spreg.t_stat.rst.txt
new file mode 100644
index 00000000..ff507b7b
--- /dev/null
+++ b/_sources/generated/spreg.t_stat.rst.txt
@@ -0,0 +1,6 @@
+spreg.t\_stat
+=============
+
+.. currentmodule:: spreg
+
+.. autofunction:: t_stat
\ No newline at end of file
diff --git a/_sources/generated/spreg.vif.rst.txt b/_sources/generated/spreg.vif.rst.txt
new file mode 100644
index 00000000..3be42ae7
--- /dev/null
+++ b/_sources/generated/spreg.vif.rst.txt
@@ -0,0 +1,6 @@
+spreg.vif
+=========
+
+.. currentmodule:: spreg
+
+.. autofunction:: vif
\ No newline at end of file
diff --git a/_sources/generated/spreg.white.rst.txt b/_sources/generated/spreg.white.rst.txt
new file mode 100644
index 00000000..dd11eb7a
--- /dev/null
+++ b/_sources/generated/spreg.white.rst.txt
@@ -0,0 +1,6 @@
+spreg.white
+===========
+
+.. currentmodule:: spreg
+
+.. autofunction:: white
\ No newline at end of file
diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt
index 68ae8673..4da4604f 100644
--- a/_sources/index.rst.txt
+++ b/_sources/index.rst.txt
@@ -1,20 +1,27 @@
-.. No Errors Test Project documentation master file, created by
- sphinx-quickstart on Fri Aug 30 17:07:56 2019.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
+.. documentation master file
-Welcome to No Errors Test Project's documentation!
-==================================================
+Spatial Regression Models (spreg)
+=================================
-.. toctree::
- :maxdepth: 2
- :caption: Hello World!
+`spreg`, short for "spatial regression," is a Python package to estimate simultaneous autoregressive spatial regression models. These models are useful when modeling processes where observations interact with one another. For more information on these models, consult the Spatial Regression short course by Luc Anselin (Spring 2017), with the Center for Spatial Data Science at the University of Chicago.
+
+----
+
+.. toctree::
+ :hidden:
+ :maxdepth: 3
+   :caption: Contents:
+
+   Installation
+ Tutorials
+ API
+ References
-Indices and tables
-==================
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+.. _PySAL: https://github.com/pysal/pysal
diff --git a/_sources/installation.rst.txt b/_sources/installation.rst.txt
new file mode 100644
index 00000000..854ee3fe
--- /dev/null
+++ b/_sources/installation.rst.txt
@@ -0,0 +1,14 @@
+.. Installation
+
+Installation
+============
+
+spreg can be installed with `pip`, the package installer for Python. To install::
+
+ pip install spreg
+
+Further, all of the stable functionality is *also* available in PySAL, the
+Python Spatial Analysis Library. PySAL can be installed using `pip` or `conda`::
+
+ pip install pysal #or
+ conda install pysal
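+
+To check that the installation worked, import the package; `__version__` is
+assumed to be defined by the installed release::
+
+   import spreg
+   print(spreg.__version__)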
diff --git a/_sources/notebooks/10_specification_tests_properties.ipynb.txt b/_sources/notebooks/10_specification_tests_properties.ipynb.txt
new file mode 100644
index 00000000..4ff36e3f
--- /dev/null
+++ b/_sources/notebooks/10_specification_tests_properties.ipynb.txt
@@ -0,0 +1,1066 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Specification Tests - Properties\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/11/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, a closer look is taken at the properties of the various specification tests for spatial effects. This is carried out by means of a series of simulation experiments on data generated under the null hypothesis of no spatial effects as well as under various alternatives. This provides insight into the distribution of the test statistics under the null and their relative power against various alternatives.\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+    "Familiarity with OLS estimation in *spreg* is assumed, as covered in the *OLS notebook* and the *Specification Tests* notebook. For the graphs, some familiarity with *matplotlib* and the `plot` functionality of *pandas* may be useful, although it is not strictly needed just to replicate the graphs shown here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main module for spatial regression in PySAL is *spreg*. In addition, *libpysal* is needed for spatial weights manipulation, and *pandas* for data frame manipulation. In these exercises, *geopandas* is not needed. In order to get nicer looking graphs, *matplotlib.pyplot* is imported as well, although this is not critical. In addition, the module *time* is used for timing experiments (optional).\n",
+ "\n",
+ "As before, only the relevant parts of *libpysal* and *spreg* are imported."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import time\n",
+ "import libpysal.weights as weights\n",
+ "from spreg import OLS, make_x, make_xb, make_wx, make_wxg, \\\n",
+ " make_error, dgp_ols, dgp_lag, dgp_sperror, dgp_slx\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from numpy:\n",
+ " - random.default_rng\n",
+ " - rng.chisquare\n",
+ " - zeros\n",
+ " - array\n",
+ " - reshape\n",
+ " - hstack\n",
+ "\n",
+ "- from pandas:\n",
+ " - DataFrame\n",
+ " - describe\n",
+ " - plot\n",
+ " \n",
+ "- from matplotlib.pyplot:\n",
+ " - show\n",
+ " \n",
+ "- from libpysal:\n",
+ " - weights.lat2W\n",
+ " - w.transform\n",
+ " - w.n\n",
+ " \n",
+ "- from spreg:\n",
+ " - OLS\n",
+ " - make_x\n",
+ " - make_xb\n",
+ " - make_wx\n",
+ " - make_wxg\n",
+ " - make_error\n",
+ " - dgp_ols\n",
+ " - dgp_lag\n",
+ " - dgp_sperror\n",
+ " - dgp_slx\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67da216d",
+ "metadata": {},
+ "source": [
+ "### Files and Variables\n",
+ "\n",
+ "In this notebook, no actual data are used, since the data sets will be created by means of simulation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0f82891",
+ "metadata": {},
+ "source": [
+ "## Model Parameters and Variables\n",
+ "\n",
+ "The various model parameters are set here, so that it is easy to replicate the experiments for different sample sizes and coefficient values.\n",
+ "\n",
+ "- gridx: the number of cells in the horizontal dimension of a regular lattice of dimension gridx x gridy \n",
+ "- gridy: the number of cells in the vertical dimension of a regular lattice of dimension gridx x gridy \n",
+ "- b1: a list with regression parameters (includes a coefficient for the constant term as the first element) \n",
+ "- k: length of b1 less one (no constant term counted)\n",
+ "- rndseed: the random seed to ensure reproducibility \n",
+ "- reps: the number of replications \n",
+ "- rhovals: a list with spatial autoregressive coefficients $\\rho$ for the lag variables Wy \n",
+ "- lamvals: a list with spatial coefficients $\\lambda$ for the error lag variables We \n",
+ "- gamvals: a list with coefficients for the SLX variables (WX) \n",
+ "- gamma: coefficient for WX in the Spatial Durbin Model \n",
+ "\n",
+ "- w: queen contiguity spatial weights\n",
+ "- n: number of observations\n",
+ "- p_value: critical value to be used for tests\n",
+ "- x1: x values (not including constant)\n",
+ "- xb1: $X \\beta$\n",
+ "- wx1: $WX$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "16ef7c91",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# grid layout and weights\n",
+ "gridx = 20\n",
+ "gridy = 20\n",
+ "w = weights.lat2W(gridx,gridy,rook=False) \n",
+ "w.transform = 'r'\n",
+ "n = w.n\n",
+ "\n",
+ "# model coefficient values\n",
+ "b1 = [1, 1, 1, 1]\n",
+ "k = len(b1)-1\n",
+ "rhovals = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, \n",
+ " 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]\n",
+ "lamvals = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, \n",
+ " 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]\n",
+ "gamvals = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, \n",
+ " 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]\n",
+ "gamma = 0.5\n",
+ "\n",
+ "# simulation parameters\n",
+ "rndseed = 123456789\n",
+ "reps = 1000\n",
+ "p_value = 0.05\n",
+ "\n",
+ "# Create X\n",
+ "rng=np.random.default_rng(seed=rndseed) # set seed for X\n",
+ "xx = make_x(rng,n*k,mu=[0],varu=[6],method=\"uniform\")\n",
+ "x1 = np.reshape(xx,(n,k))\n",
+ "xb1 = make_xb(x1,b1) # no constant in x1, but a coefficient for the constant in b1\n",
+ "wx1 = make_wx(x1,w) # default first order no constant"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e8cd31f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Summary of Simulation Design Parameters\")\n",
+ "print(\"grid size: \",gridx,\" x \",gridy)\n",
+ "print(\"n: \",n,\" k: \",k)\n",
+ "print(\"betas: \",b1)\n",
+ "print(\"random seed: \",rndseed)\n",
+ "print(\"replications: \",reps)\n",
+ "print(\"p-value: \",p_value)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e1a81dd",
+ "metadata": {},
+ "source": [
+ "## Distribution Under the Null"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e90ce6c",
+ "metadata": {},
+ "source": [
+ "The distribution under the null is obtained by simulating **reps** data sets under the null of standard normal errors. The reference distributions for Chi-squared are obtained by means of `rng.chisquare` with the appropriate degrees of freedom. For the LMWX and LMSDM tests, these depend on the number of variables in the X matrix ($k$). \n",
+ "\n",
+ "The values for the test statistics are taken from the OLS regression object with the arguments `spat_diag=True` and `moran=True`.\n",
+ "\n",
+ "All the results are collected into a *pandas* dataframe that is then used to compute the descriptive statistics by means of `describe` and to create the various plots."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f62d9193",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "\n",
+ "# all distributions under the null\n",
+ "alltests = [\"N01\",\"Chi2-1\",\"Chi2-2\",\"Chi2-k\",\"Chi2-kr\",\n",
+ " \"Moran\",\"LM-Lag\",\"LM-Error\",\"LMWX\",\n",
+ " \"LMSARER\",\"LMSDM\"]\n",
+ "# initialize\n",
+ "best = np.zeros((reps,len(alltests)))\n",
+ "rng=np.random.default_rng(seed=rndseed)\n",
+ "# reference distributions as random number draws\n",
+ "# standard normal\n",
+ "nn = make_error(rng,reps)\n",
+ "best[:,0] = nn[:,0]\n",
+ "# chi-squared\n",
+ "for j in range(2):\n",
+ " df = j+1\n",
+ " best[:,df] = rng.chisquare(df,reps)\n",
+ "for dff in [k,k+1]: # d.f. for LMWX and LMSDM depends on k\n",
+ " df = df + 1\n",
+ " best[:,df] = rng.chisquare(dff,reps)\n",
+ " \n",
+ "# replications\n",
+ "for i in range(reps):\n",
+ " u = make_error(rng,n)\n",
+ " \n",
+ " y = dgp_ols(u,xb1)\n",
+ " reg = OLS(y,x1,w=w,spat_diag=True,moran=True)\n",
+ " testres = [reg.moran_res[1],reg.lm_lag[0],reg.lm_error[0],reg.lm_wx[0],\n",
+ " reg.lm_sarma[0],reg.lm_spdurbin[0]]\n",
+ " for jj in range(len(testres)):\n",
+ " best[i,jj+5] = testres[jj]\n",
+ "\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "\n",
+ "results = pd.DataFrame(best,columns=alltests)\n",
+ "results.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c1d56cf2",
+ "metadata": {},
+ "source": [
+ "The most relevant characteristics are the mean and the standard deviation. For the draws from the theoretical distributions, the means are roughly what would be expected, i.e., around 0 for the standard normal, and around the degrees of freedom for the Chi-squared distributions. The standard deviation for the standard normal should be around 1, whereas for the Chi-squared distributions it should be roughly the square root of twice the degrees of freedom. For example, for 1 degree of freedom, it should be around 1.41 (here, 1.26), for two degrees of freedom, around 2 (here, 2.01), etc. \n",
+ "\n",
+ "The mean for the standardized z-value for Moran's I is -0.015 with a standard deviation of 0.986, close to the moments of a standard normal distribution. The mean of the LM statistics is roughly what would be expected.\n",
+ "\n",
+ "A closer look at the distribution of the test statistics under the null is obtained in a graph."
+ ]
+ },
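+  {
+   "cell_type": "markdown",
+   "id": "chi2-moments-note",
+   "metadata": {},
+   "source": [
+    "As a quick check on the numbers quoted above, recall that a $\\chi^2(df)$ distribution has mean $df$ and standard deviation $\\sqrt{2 df}$. The short cell below (an added illustration, not part of the simulation experiment) simply evaluates these reference values for the degrees of freedom used here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "chi2-moments-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# theoretical mean and standard deviation of chi-squared(df)\n",
+    "# the df values match the reference distributions used above: 1, 2, k and k+1\n",
+    "for df in [1, 2, k, k+1]:\n",
+    "    print(\"df =\", df, \" mean =\", df, \" std =\", round(np.sqrt(2 * df), 2))"
+   ]
+  },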
+ {
+ "cell_type": "markdown",
+ "id": "4ee0ebe0",
+ "metadata": {},
+ "source": [
+ "### Moran's I"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42c4083b",
+ "metadata": {},
+ "source": [
+ "The `plot` functionality of a *pandas* dataframe is used to create a density plot for Moran's I. It is contrasted with a density plot from simulated standard normal variates. The argument `kind=\"kde\"` is used to obtain a density plot. The columns in the data frame are **N01** and **Moran**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0ba0b8b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results.plot(y=[\"N01\",\"Moran\"],kind=\"kde\",\n",
+ " title=\"Distribution under Null\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfed695f",
+ "metadata": {},
+ "source": [
+ "The result illustrates how Moran's I standardized z-value closely tracks the standard normal distribution under the null."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f477764b",
+ "metadata": {},
+ "source": [
+ "### LM-Lag and LM-Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "379ec31d",
+ "metadata": {},
+ "source": [
+ "For LM-Lag and LM-Error, the reference distribution is a Chi-squared distribution with one degree of freedom. Since the `kde` interpolation results in values less than 0, which is impossible for Chi-squared, the graph is truncated at 0 by means of `xlim=((0,None))`. This still results in a slight bump at zero for one degree of freedom, where in the strict sense there should not be one. For the purposes here, this does not matter.\n",
+ "\n",
+ "The columns in the data frame are **Chi2-1**, **LM-Lag** and **LM-Error**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a59392c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results.plot(y=[\"Chi2-1\",\"LM-Lag\",\"LM-Error\"],kind=\"kde\",\n",
+ " title=\"Distribution under Null\",xlim=((0,None)))\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9f689bd",
+ "metadata": {},
+ "source": [
+ "In contrast to the results for Moran's I, the graphs closely follow the pattern for the $\\chi^2(1)$ distribution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ee163a0b",
+ "metadata": {},
+ "source": [
+ "### LM-SARERROR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9b665a13",
+ "metadata": {},
+ "source": [
+ "In the same way, the distribution of LM-SARERROR is compared to a $\\chi^2(2)$ distribution by selecting the proper columns in the data frame, i.e., **Chi2-2** and **LMSARER**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a65641d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results.plot(y=[\"Chi2-2\",\"LMSARER\"],kind=\"kde\",\n",
+ " title=\"Distribution under Null\",xlim=((0,None)))\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "484123ff",
+ "metadata": {},
+ "source": [
+ "The two plots track closely."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd4a6383",
+ "metadata": {},
+ "source": [
+ "### LM-WX"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91ee2d39",
+ "metadata": {},
+ "source": [
+ "The plot for LM-WX is again obtained by selecting the proper columns. The reference distribution is now $\\chi^2(k)$, where $k$ is the number of explanatory variables, not counting the constant. In the example, this is 3. The corresponding columns in the data frame are **Chi2-k** and **LMWX**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5a93eed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results.plot(y=[\"Chi2-k\",\"LMWX\"],kind=\"kde\",\n",
+ " title=\"Distribution under Null\",xlim=((0,None)))\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fcb021d",
+ "metadata": {},
+ "source": [
+ "The mode obtained for the test statistic is somewhat smaller than its expected value, but otherwise the two curves track closely."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "192096da",
+ "metadata": {},
+ "source": [
+ "### LM-Spatial Durbin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "37b65e4f",
+ "metadata": {},
+ "source": [
+ "For the LM-Spatial Durbin test, the theoretical distribution has $k+1$ degrees of freedom, which includes a degree for the spatial autoregressive parameter as well as the explanatory variables. The data frame columns are **Chi2-kr** and **LMSDM**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "65c57720",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results.plot(y=[\"Chi2-kr\",\"LMSDM\"],kind=\"kde\",\n",
+ " title=\"Distribution under Null\",xlim=((0,None)))\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "847cac23",
+ "metadata": {},
+ "source": [
+ "The same general pattern is obtained as for LM-WX, i.e., the mode is somewhat lower than the expected theoretical value, but otherwise the two graphs track closely."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f86fce8",
+ "metadata": {},
+ "source": [
+ "## Power Functions - Lag Alternative"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27f08fe5-c532-4cb0-baf8-e2ea3e75ca04",
+ "metadata": {},
+ "source": [
+ "Power functions show the rejection percentage of a test statistic for a given p-value at different parameter values for the alternative hypothesis. For the Lag alternative, this is obtained by setting the value for $\\rho$ and generating a vector for $y$ using `dgp_lag`. This dependent variable is then used in a standard OLS regression. For each replication, the p-values for the various spatial diagnostics are extracted from the regression object and compared to the critical value (**p_value**).\n",
+ "\n",
+ "The comparison is turned into a 0-1 variable for each replication. Finally, the `mean` over all replications is the rejection frequency for each test for the given value of $\\rho$. The resulting array is turned into a *pandas* data frame to `plot` the corresponding power functions.\n",
+ "\n",
+ "Depending on the hardware, this simulation can take a few minutes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "de366884",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pvals = [\"Moran\",\"LM-Lag\",\"LM-Error\",\"LMWX\",\n",
+ " \"LMSARER\",\"LMSDM\",\"RLM-Lag\",\"RLM-Error\",\"RLMWX\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ad75e89",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "powvals = np.zeros((len(rhovals),len(pvals)+1))\n",
+ "powvals[:,0] = rhovals\n",
+ "for r in range(len(rhovals)): \n",
+ " best = np.zeros((reps,len(pvals)))\n",
+ " rng=np.random.default_rng(seed=rndseed)\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n) \n",
+ " y = dgp_lag(u,xb1,w,rho=rhovals[r])\n",
+ " reg = OLS(y,x1,w,spat_diag=True,moran=True)\n",
+ " testp = [reg.moran_res[2],reg.lm_lag[1],reg.lm_error[1],\n",
+ " reg.lm_wx[1],reg.lm_sarma[1],\n",
+ " reg.lm_spdurbin[1],reg.rlm_lag[1],\n",
+ " reg.rlm_error[1],reg.rlm_wx[1]]\n",
+ " best[i,:] = testp\n",
+ " \n",
+ " bestp = (best < p_value) * 1 # significant \n",
+ " mm = bestp.mean(axis=0)\n",
+ " powvals[r,1:]= mm\n",
+ " \n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "\n",
+ "powresult = pd.DataFrame(powvals,columns=[\"rho\"]+pvals)\n",
+ "\n",
+ "print(\"Test Power for different Rho\")\n",
+ "print(powresult)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b74d467d",
+ "metadata": {},
+ "source": [
+ "The results data frame shows how the rejection frequency for the various spatial diagnostics changes with the value of the spatial autoregressive parameter $\\rho$.\n",
+ "\n",
+ "The relative shape of the associated power curves can be compared by means of the `plot` functionality of the *pandas* data frame. For the current purposes, only a rudimentary graph is shown. Fancier versions can be obtained by means of the full functionality of *matplotlib*."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "620eb16e",
+ "metadata": {},
+ "source": [
+ "### Single alternative tests for Lag and Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c202611d",
+ "metadata": {},
+ "source": [
+ "The first comparison is between the traditional LM-Lag and LM-Error tests, their robust forms and Moran's I."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a87d7eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LM-Lag\",\"RLM-Lag\",\"Moran\",\"LM-Error\",\"RLM-Error\"]\n",
+ "powresult.plot(x=\"rho\",y=testnames,\n",
+ " title=\" Lag Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3fc9a38f",
+ "metadata": {},
+ "source": [
+    "The power functions for LM-Lag and its robust form are the two left-most curves. They track each other very closely and achieve a 100% rejection rate for values of $\\rho$ as low as 0.2. Next most powerful are LM-Error and Moran's I, which achieve almost identical power with 100% rejection for $\\rho = 0.3$. Finally, while the robust form of LM-Error provides an effective correction for small values of $\\rho$, it too achieves a 100% rejection rate, but only for $\\rho = 0.4$. This illustrates the difficulty of identifying the proper alternative when the spatial autoregressive parameter is large."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ad1f306",
+ "metadata": {},
+ "source": [
+ "### Lag tests and test against WX"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a102155f",
+ "metadata": {},
+ "source": [
+ "A second comparison is between the LM-Lag test and its robust form and the LM test on WX and its robust form (robust to the presence of a spatial lag term)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3659d8ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LM-Lag\",\"RLM-Lag\",\"LMWX\",\"RLMWX\"]\n",
+ "powresult.plot(x=\"rho\",y=testnames,\n",
+ " title=\" Lag Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1f44aed",
+ "metadata": {},
+ "source": [
+    "The power of the LM-WX test tracks that for LM-Lag and Robust LM-Lag very closely, with only slightly less power for the smallest values of $\\rho$. However, just as for the Lag tests, it reaches 100% rejection for $\\rho = 0.2$. In contrast, the robust form of the LM-WX test has much less power and only reaches the 100% rejection rate for $\\rho = 0.45$. This illustrates the effectiveness of the correction for small values of $\\rho$, i.e., for *local* alternatives."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bac55f9c",
+ "metadata": {},
+ "source": [
+ "### Lag tests and higher order tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3448acfd",
+ "metadata": {},
+ "source": [
+ "A final comparison is between the Lag tests and the higher order diagnostics, i.e., LM-SAR-Error and LM-SDM."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28f7e828",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LM-Lag\",\"RLM-Lag\",\"LMSARER\",\"LMSDM\"]\n",
+ "powresult.plot(x=\"rho\",y=testnames,\n",
+ " title=\" Lag Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c1e2b862",
+ "metadata": {},
+ "source": [
+    "The results clearly illustrate the unfortunate property of the higher-order tests of having strong power against the single-parameter Lag alternative. Both LM-SARER and LM-SDM reach 100% rejection for $\\rho = 0.2$, and they have only slightly less power than the Lag tests for smaller values of $\\rho$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "84c3333f",
+ "metadata": {},
+ "source": [
+ "## Power Functions - Error Alternative"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "04fa1806",
+ "metadata": {},
+ "source": [
+ "The same approach can be taken to assess the relative power of the spatial diagnostics against an error alternative. The code is essentially the same as before, except that **lamvals** is used for the spatial parameter and `dgp_sperror` is used for the data generating process.\n",
+ "\n",
+    "A more pythonic solution would be to put all these operations in a function and pass the dgp as a function object to that function, but the current structure of the `dgp` module makes that difficult to fully generalize. A rough sketch of such a helper is included after the simulation cell below.\n",
+ "\n",
+ "The same spatial diagnostics as before are considered. The simulation can take quite a bit longer than for the Lag case."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9401c8ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pvals = [\"Moran\",\"LM-Lag\",\"LM-Error\",\"LMWX\",\"LMSARER\",\"LMSDM\",\n",
+ " \"RLM-Lag\",\"RLM-Error\",\"RLMWX\"]\n",
+ "t0 = time.time()\n",
+ "powvals = np.zeros((len(lamvals),len(pvals)+1))\n",
+ "powvals[:,0] = lamvals\n",
+ "for r in range(len(lamvals)): \n",
+ " best = np.zeros((reps,len(pvals)))\n",
+ " rng=np.random.default_rng(seed=rndseed)\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n) \n",
+ " y = dgp_sperror(u,xb1,w,lam=lamvals[r])\n",
+ " reg = OLS(y,x1,w,spat_diag=True,moran=True)\n",
+ " testp = [reg.moran_res[2],reg.lm_lag[1],reg.lm_error[1],\n",
+ " reg.lm_wx[1],reg.lm_sarma[1],\n",
+ " reg.lm_spdurbin[1],reg.rlm_lag[1],\n",
+ " reg.rlm_error[1],reg.rlm_wx[1]]\n",
+ " best[i,:] = testp\n",
+ " bestp = (best < p_value) * 1 # significant\n",
+ " mm = bestp.mean(axis=0)\n",
+ " powvals[r,1:]= mm\n",
+ "\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "\n",
+ "powresult = pd.DataFrame(powvals,columns=[\"lam\"]+pvals)\n",
+ "\n",
+ "print(\"Test Power for different Lambda\")\n",
+ "print(powresult)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20cf841a",
+ "metadata": {},
+ "source": [
+ "### Single alternative tests for Lag and Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9fdc519",
+ "metadata": {},
+ "source": [
+ "As in the case of the Lag DGP, the first comparison is between the LM tests, their robust versions and Moran's I."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4b78e902",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LM-Lag\",\"RLM-Lag\",\"Moran\",\n",
+ " \"LM-Error\",\"RLM-Error\"]\n",
+ "powresult.plot(x=\"lam\",y=testnames,\n",
+ " title=\" Error Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bba1159f",
+ "metadata": {},
+ "source": [
+ "The left-most power curves are for Moran's I, LM-Error and Robust LM-Error, with a slight edge for Moran's I for the smaller values of $\\lambda$. Unlike the patterns observed for the Lag alternative, the 100% rejection rate is not reached until $\\lambda = 0.45$ for Moran's I and LM-Error, and $\\lambda = 0.5$ for Robust LM-Error, illustrating an overall lower power of these tests against the error alternative relative to the lag alternative. Also, LM-Lag has much less power against this alternative than LM-Error did against the Lag alternative. Its curve is well to the right of that for the specific error tests and only reaches the 100% rejection rate for $\\lambda = 0.85$. The Robust LM-Lag test has negligible power against the error alternative. Even for $\\lambda = 0.9$, its rejection rate is only 34%."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "62044597",
+ "metadata": {},
+ "source": [
+ "### Error tests and test against WX"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42f9bcc2",
+ "metadata": {},
+ "source": [
+ "A second comparison is between the two LM tests against error and LM-WX and its robust form (robust against lag)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "be151f52",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LM-Error\",\"RLM-Error\",\"LMWX\",\n",
+ " \"RLMWX\"]\n",
+ "powresult.plot(x=\"lam\",y=testnames,\n",
+ " title=\" Error Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a5f30cc",
+ "metadata": {},
+ "source": [
+ "As is to be expected, the LM-Error and Robust LM-Error have the highest power, but surprisingly, the Robust LM-WX has much higher power than LM-WX itself. Its power curve is consistently above that of LM-WX, which contradicts the theoretical requirement that the robust form of a test should be smaller than the original statistic.\n",
+ "\n",
+ "The LM-WX test only reaches a 60.6% rejection rate for $\\lambda = 0.9$, whereas Robust LM-WX has a 100% rejection rate for $\\lambda = 0.5$, essentially the same as Robust LM-Error, although it remains below the two LM curves for all smaller values of $\\lambda$.\n",
+ "\n",
+ "This phenomenon illustrates the *two out of three* problem associated with the robust LM-tests, namely that they are constructed to take into account only one of potentially two types of misspecification. In this case, the correction of LM-WX is for the presence of a spatial Lag term, but the actual DGP is a spatial error model, which is no longer a *local* alternative, hence the strange behavior of the robust test."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88a75771",
+ "metadata": {},
+ "source": [
+ "### Error tests and higher order tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3e07985",
+ "metadata": {},
+ "source": [
+ "The final comparison is between LM-Error and its Robust form and the LM-SARER and LM-SDM tests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "838ff5d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LM-Error\",\"RLM-Error\",\"LMSARER\",\n",
+ " \"LMSDM\"]\n",
+ "powresult.plot(x=\"lam\",y=testnames,\n",
+ " title=\" Error Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5910c02f",
+ "metadata": {},
+ "source": [
+ "As was the case for the Lag DGP, the LM-SARER test has strong power against the one-directional error alternative. It reaches the 100% rejection rate for $\\lambda = 0.5$, the same as the robust LM-Error, although for the smaller values of $\\lambda$ its power curve is always slightly below those of the one-directional LM tests. \n",
+ "\n",
+ "More disturbing are the results for a two-directional test against the spatial Durbin DGP (i.e., both $\\rho$ and $\\gamma$ non-zero), which obtains almost the same power as the other tests against the error DGP. In other words, even in the total absence of an SDM model, the LM-SDM test will point to that alternative when the true DGP is an error model. At some level, this may be expected, since SDM is equivalent to an error specification under the spatial common factor coefficient constraints. However, in practice, this result may be confusing. One would expect that after estimating an SDM, the common factor test would point to an error specification, but this is not the most efficient approach."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ac105c5",
+ "metadata": {},
+ "source": [
+ "## Power Functions - SLX Alternative"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80fcfec9",
+ "metadata": {},
+ "source": [
+ "A final analysis of the power of the various spatial diagnostics is when the alternative is an SLX model. The same approach is taken as for the other two cases, but this time the loop is over the values of $\\gamma$, the coefficients of the WX variables. For the sake of simplicity, these are taken to be the same for each spatially lagged explanatory variable.\n",
+ "\n",
+ "The code is again essentially the same as before, except that now the argument **wxg1** must be calculated for each value of $\\gamma$ and then passed to the `dgp_slx` function for the data generating process. The results are again collected in a data frame for visualization.\n",
+ "\n",
+ "This simulation takes slightly less time than the previous cases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6748ac83",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pvals = [\"Moran\",\"LM-Lag\",\"LM-Error\",\"LMWX\",\"LMSARER\",\n",
+ " \"LMSDM\",\"RLM-Lag\",\"RLM-Error\",\"RLMWX\"]\n",
+ "t0 = time.time()\n",
+ "powvals = np.zeros((len(gamvals),len(pvals)+1))\n",
+ "powvals[:,0] = gamvals\n",
+ "for r in range(len(gamvals)):\n",
+ " g = gamvals[r]\n",
+ " gg = [g for i in b1[0:-1]] # create list of gamma values of the correct length\n",
+ " wxg1 = make_wxg(wx1,gg)\n",
+ " best = np.zeros((reps,len(pvals)))\n",
+ " rng=np.random.default_rng(seed=rndseed)\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n) \n",
+ " y = dgp_slx(u,xb1,wxg1)\n",
+ " reg = OLS(y,x1,w,spat_diag=True,moran=True)\n",
+ " testp = [reg.moran_res[2],reg.lm_lag[1],reg.lm_error[1],reg.lm_wx[1],reg.lm_sarma[1],\n",
+ " reg.lm_spdurbin[1],reg.rlm_lag[1],reg.rlm_error[1],reg.rlm_wx[1]]\n",
+ " best[i,:] = testp\n",
+ " bestp = (best < p_value) * 1 # significant\n",
+ " mm = bestp.mean(axis=0)\n",
+ " powvals[r,1:]= mm\n",
+ "\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "\n",
+ "powresult = pd.DataFrame(powvals,columns=[\"gam\"]+pvals)\n",
+ "\n",
+ "print(\"Test Power for different Gamma\")\n",
+ "print(powresult)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02d6001d",
+ "metadata": {},
+ "source": [
+ "### SLX tests and Moran's I"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90d923c6",
+ "metadata": {},
+ "source": [
+ "As a first comparison, the power curves of LM-WX and its Robust version are plotted together with the power curve of Moran's I for this alternative."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f216376b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LMWX\",\"RLMWX\",\"Moran\"]\n",
+ "powresult.plot(x=\"gam\",y=testnames,\n",
+ " title=\" SLX Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8174464f",
+ "metadata": {},
+ "source": [
+ "Overall, the LM-WX test has good power, achieving a 100% rejection rate for $\\gamma = 0.2$. Not surprisingly, Moran's I has decent power against this alternative as well, illustrating its usefulness as a misspecification test. It only starts to gain power around $\\gamma = 0.2$, but reaches the 100% rejection rate for $\\gamma = 0.45$.\n",
+ "\n",
+ "On the other hand, the shape of the power curve for Robust LM-WX is bizarre. The test has essentially no power at all after correcting for the (inappropriate) presence of a spatial autoregressive term. This again illustrates how the robustness correction is not appropriate when the ignored alternative is not local."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a5e939b",
+ "metadata": {},
+ "source": [
+ "### SLX tests and Error tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d62196aa",
+ "metadata": {},
+ "source": [
+ "A second comparison rates the power curve for LM-WX against that for the LM-Error test and its robust form."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9dbb1c4d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LMWX\",\"RLMWX\",\"LM-Error\",\"RLM-Error\"]\n",
+ "powresult.plot(x=\"gam\",y=testnames,\n",
+ " title=\" SLX Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa4022f0",
+ "metadata": {},
+ "source": [
+ "Whereas the LM-Error test has power against the SLX alternative, similar to that of Moran's I, its robust version does not. Since it corrects for an inappropriate DGP, its behavior is non-standard and it has essentially no power against this alternative. LM-Error has slightly less power than Moran's I (in the previous graph), but also reaches the 100% rejection rate for $\\gamma = 0.45$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b91c7bc",
+ "metadata": {},
+ "source": [
+ "### SLX tests and Lag tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88cd7427",
+ "metadata": {},
+ "source": [
+ "Another interesting comparison is between LM-WX and the power curves for LM-Lag and its robust form. In contrast to the LM-Error DGP, which has no commonality with the SLX DGP, the latter can be considered as a truncated form of the Lag DGP. Hence one would expect the Lag tests to have some power against the SLX alternative."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "41b7db1b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LMWX\",\"RLMWX\",\"LM-Lag\",\"RLM-Lag\"]\n",
+ "powresult.plot(x=\"gam\",y=testnames,\n",
+ " title=\" SLX Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2627dad7",
+ "metadata": {},
+ "source": [
+ "The Lag tests indeed show strong power against the SLX alternative. In fact, for small values of $\\gamma$, their power slightly exceeds that of LM-WX. In this case as well, the robust form of LM-Lag corrects for the wrong alternative. As a result, it tends to have slightly higher power than LM-Lag itself, which is not the proper behavior. All three tests achieve 100% rejection rate for $\\gamma = 0.2$. The graph for RLMWX is the same as before."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56fbde58",
+ "metadata": {},
+ "source": [
+ "### SLX tests and higher order tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5feee3f2",
+ "metadata": {},
+ "source": [
+ "A final comparison is between the LM-WX test and the two higher order tests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ede0ffdf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "testnames = [\"LMWX\",\"RLMWX\",\"LMSARER\",\"LMSDM\"]\n",
+ "powresult.plot(x=\"gam\",y=testnames,\n",
+ " title=\" SLX Alternative - Power Functions\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9845291",
+ "metadata": {},
+ "source": [
+ "In this case as well, the power curves of the higher order tests closely track that for LM-WX, achieving 100% rejection rate for $\\gamma = 0.2$. For the smaller values of $\\gamma$, the power of LM-SARER is slightly higher and that of LM-SDM slightly lower than that of LM-WX.\n",
+ "\n",
+ "This again highlights the caution that is needed when interpreting the results of the higher order tests. Both also have excellent power against one-directional alternatives, which can easily provide misleading guidance in a specification search."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "The range of comparisons can easily be expanded using different spatial layouts and associated sample sizes as well as different DGP, such as a moving average error process, or various higher order DGP."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/11_distance_decay.ipynb.txt b/_sources/notebooks/11_distance_decay.ipynb.txt
new file mode 100644
index 00000000..77f755d8
--- /dev/null
+++ b/_sources/notebooks/11_distance_decay.ipynb.txt
@@ -0,0 +1,472 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Distance Decay\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/18/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, a closer look is taken at the distance decay implied by different kernel functions that are used in the linear SLX model, as well as by the nonlinear distance transformations used in the NSLX specification. Since these functions are derived for the k-nearest neighbors, the spatial patterning of these neighbors will affect how much relevance the values observed at those locations have for the spatial lag. For example, if the nearest neighbors are all rather far away from the origin, e.g., with a relative distance (z) of 0.5 or more, a steep distance decay will essentially eliminate their effect.\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+ "Familiarity with *numpy*, *pandas* operations and the `plot` function for *pandas* dataframes is assumed. Further customization can be carried out by means of specific features of *matplotlib*, but that is not considered here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "Since no *spreg* functions are used, the imports are limited to *numpy*, *pandas* and *matplotlib.pyplot* (the latter is optional)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "np.set_printoptions(legacy = \"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from numpy:\n",
+ " - linspace\n",
+ " - reshape\n",
+ " - hstack\n",
+ "\n",
+ "- from pandas:\n",
+ " - DataFrame\n",
+ " - concat\n",
+ " - plot\n",
+ " \n",
+ "- from matplotlib.pyplot:\n",
+ " - show\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67da216d",
+ "metadata": {},
+ "source": [
+ "### Files and Variables\n",
+ "\n",
+ "In this notebook, no actual data are used."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3ca18922",
+ "metadata": {},
+ "source": [
+ "## Distance Metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71e1aa24",
+ "metadata": {},
+ "source": [
+ "To obtain comparable graphs, the distance is scaled to a value less than one. This corresponds to the $z$ value that is computed relative to the bandwidth. This approach avoids problems due to the varying scale of the actual distance metric (e.g., meters vs. km) and makes all distance decay functions comparable.\n",
+ "\n",
+ "For the purpose of graphing, 21 equidistant points (i.e., 20 equal intervals) are used in the interval 0-1, by means of `numpy.linspace`.\n",
+ "\n",
+ "In practice, the $z$ values are computed for up to k nearest neighbors, which are not necessarily equally spaced. In fact, the weight of neighbors will vary depending on whether they are close to the origin or closer to the farthest nearest neighbor."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "95f932d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "z = np.linspace(0.0,1.0,num=21)\n",
+ "z = z.reshape(-1,1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fe8a0df",
+ "metadata": {},
+ "source": [
+ "## Kernel Functions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ecced701",
+ "metadata": {},
+ "source": [
+ "### Triangular Kernel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ba12119c",
+ "metadata": {},
+ "source": [
+ "The triangular kernel is $1.0 - z$. Note that in the SLX implementation, the value at the origin (diagonal) is set to zero. It is kept at 1.0 here to make the graph clearer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "78a29421",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tri = 1.0 - z\n",
+ "tri = tri.reshape(-1,1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f62b6952",
+ "metadata": {},
+ "source": [
+ "The kernel function is matched with the distance metric to create a simple plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "848554c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p1 = np.hstack((z,tri))\n",
+ "df1 = pd.DataFrame(p1,columns=[\"distance\",\"triangular\"])\n",
+ "df1.plot(x=\"distance\",y=\"triangular\",kind=\"line\",title=\"Triangular Kernel\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "af65df28",
+ "metadata": {},
+ "source": [
+ "As expected, the result is a linear decline with distance, yielding a weight equal to $1.0 - z$ at each point. The function terminates with a value of zero for $z = 1.0$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35f2b7f4",
+ "metadata": {},
+ "source": [
+ "### Quadratic/Epanechnikov Kernel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "03c7482b",
+ "metadata": {},
+ "source": [
+ "The quadratic or Epanechnikov kernel takes the value $1.0 - z^2$. Strictly speaking, this value is scaled by 3/4, but this can be ignored here. The resulting distance decay graph is obtained in the same way as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "90ead891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "z2 = z**2\n",
+ "z2 = z2.reshape(-1,1)\n",
+ "epa = 1.0 - z2\n",
+ "p2 = np.hstack((z,epa))\n",
+ "df2 = pd.DataFrame(p2,columns=[\"distance\",\"quadratic\"])\n",
+ "df2.plot(x=\"distance\",y=\"quadratic\",kind=\"line\",title=\"Epanechnikov Kernel\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "553aab44",
+ "metadata": {},
+ "source": [
+ "Note how the curvature is above the line that connects the two endpoints, yielding generally higher weights for the neighbors than the corresponding triangular kernel weights."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d30dbb5a",
+ "metadata": {},
+ "source": [
+ "### Quartic Kernel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88e81846",
+ "metadata": {},
+ "source": [
+ "The quartic kernel is $(1.0 - z^2)^2$. Again, this is customarily scaled by 15/16, which is ignored here. The graph is obtained in the same way."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "50bf33cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "quar = (1.0 - z2) ** 2\n",
+ "p3 = np.hstack((z,quar))\n",
+ "df3 = pd.DataFrame(p3,columns=[\"distance\",\"quartic\"])\n",
+ "df3.plot(x=\"distance\",y=\"quartic\",kind=\"line\",title=\"Quartic Kernel\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b334715e",
+ "metadata": {},
+ "source": [
+ "Note the characteristic reverse S-shape with higher than linear weights close to the origin, changing to less than linear values in the second half of the distance range.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55c52e32",
+ "metadata": {},
+ "source": [
+ "### Gaussian Kernel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "778b96cc",
+ "metadata": {},
+ "source": [
+ "The Gaussian kernel takes the form $\\sqrt{2 \\pi}\\, e^{-z^2/2}$. Note that, in contrast to the quadratic and quartic kernel functions, the scaling factor would yield a value at the origin larger than one, with values generally above one in the distance range considered. The kernel is therefore applied here without the scaling factor, in the same way as for the other kernels. The graph is obtained in the same fashion as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32cd8caf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "zz = -z2 / 2.0\n",
+ "gs = np.exp(zz)\n",
+ "p4 = np.hstack((z,gs))\n",
+ "df4 = pd.DataFrame(p4,columns=[\"distance\",\"Gaussian\"])\n",
+ "df4.plot(x=\"distance\",y=\"Gaussian\",kind=\"line\",title=\"Gaussian Kernel\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f788fb57",
+ "metadata": {},
+ "source": [
+ "Similar to the pattern for the quadratic kernel, this yields weights above the diagonal for the distance range. Also note that the value at the end point is not zero, but around 0.6 (in fact, for $z = 1$, the kernel value is $e^{-1/2} \\approx 0.6$)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9ef737e",
+ "metadata": {},
+ "source": [
+ "### Comparison"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "79857c9b",
+ "metadata": {},
+ "source": [
+ "To facilitate a comparison of the distance decay patterns generated by the different functions, they are next plotted together, anchored to the value of one at the origin."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f5eda88",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pall = pd.concat((df1,df2[\"quadratic\"],df3[\"quartic\"],df4[\"Gaussian\"]),axis=1)\n",
+ "pall.plot(x=\"distance\",y=[\"triangular\",\"quadratic\",\"quartic\",\"Gaussian\"],kind=\"line\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f2b7b32",
+ "metadata": {},
+ "source": [
+ "The curves clearly show how the nonlinear functions provide differential weights with distance. Relative to the triangular kernel, only the quartic function penalizes more distant locations more. As pointed out earlier, the Gaussian kernel gives much higher weights than the other curves, implying very little distance decay."
+ ]
+ },
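+ {
+ "cell_type": "markdown",
+ "id": "aa10f003",
+ "metadata": {},
+ "source": [
+ "To quantify the comparison, the weights implied by each kernel at a few scaled distances can be listed directly. This is simple arithmetic on the same unscaled kernel expressions used above, added here only as an illustration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa10f004",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# kernel weights at selected scaled distances (unscaled kernel forms, as above)\n",
+ "for zi in [0.25, 0.5, 0.75]:\n",
+ "    print(f\"z={zi}: triangular {1-zi:.3f}, quadratic {1-zi**2:.3f},\"\n",
+ "          f\" quartic {(1-zi**2)**2:.3f}, Gaussian {np.exp(-zi**2/2):.3f}\")"
+ ]
+ },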
+ {
+ "cell_type": "markdown",
+ "id": "77f18355",
+ "metadata": {},
+ "source": [
+ "## Negative Exponential Distance Function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc914b9f",
+ "metadata": {},
+ "source": [
+ "The negative exponential distance function that is implemented for the nonlinear SLX model uses the transformation $e^{-\\alpha z}$, where $z$ is the same distance fraction as for the kernel functions. This function can be readily graphed for a range of values of the $\\alpha$ parameter. Note that $\\alpha$ is taken to be positive. Negative values for this coefficient are not allowed, since they would yield increasing weights with distance, violating Tobler's law. For $\\alpha = 0$, the weights are constant and equal to one."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c6e72b8f",
+ "metadata": {},
+ "source": [
+ "To illustrate the shape of the distance decay function for different parameter values, a range from 0 to 6.0 is considered. To graph the functions, only a simple computation is needed, which is implemented in a `for` loop."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff43c93b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a = [0.0, 0.5, 1.0, 2.0, 6.0]\n",
+ "\n",
+ "p5 = z\n",
+ "for i in a:\n",
+ " za = - i * z\n",
+ " expon = np.exp(za)\n",
+ " p5 = np.hstack((p5,expon))\n",
+ "\n",
+ "cols = [\"exp\" + str(i) for i in a]\n",
+ "cols.insert(0,\"distance\")\n",
+ "\n",
+ "df5 = pd.DataFrame(p5,columns=cols)\n",
+ "df5.plot(x=\"distance\",y=cols[1:],kind=\"line\",title=\"Negative Exponential Distance\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a335ca6f",
+ "metadata": {},
+ "source": [
+ "Using $\\alpha = 1.0$ as a reference, values less than 1.0 yield a much slower distance decay and vice versa. Note that the value of the weight at the end point is $e^{-\\alpha}$, which remains well above zero for small values of $\\alpha$. Once $\\alpha$ becomes larger, the distance decay becomes very steep. As illustrated in the graph, for a value of 6.0, the weights are essentially zero for $z > 0.5$, wiping out the effect of any nearest neighbors beyond that distance. It is important to keep this in mind when interpreting the coefficient estimates in an NSLX model."
+ ]
+ },
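+ {
+ "cell_type": "markdown",
+ "id": "aa10f005",
+ "metadata": {},
+ "source": [
+ "To make the endpoint behavior concrete, the small check below (plain arithmetic, not part of any spreg functionality) lists the weights implied at $z = 0.5$ and at the bandwidth ($z = 1$) for the same values of $\\alpha$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa10f006",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# negative exponential weights at z = 0.5 and at the bandwidth (z = 1)\n",
+ "alphas = np.array(a)\n",
+ "endpoints = pd.DataFrame({\"alpha\": alphas,\n",
+ "                          \"weight at z=0.5\": np.exp(-alphas * 0.5),\n",
+ "                          \"weight at z=1.0\": np.exp(-alphas)})\n",
+ "print(endpoints)"
+ ]
+ },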
+ {
+ "cell_type": "markdown",
+ "id": "46c208e1",
+ "metadata": {},
+ "source": [
+ "## Inverse Distance Power Function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46b3a138",
+ "metadata": {},
+ "source": [
+ "The inverse distance power transformation takes the form $(1.0 - z)^{\\alpha}$, with $\\alpha$ taking on a positive value. Using the same procedure as before, the implied distance decay can be graphed for a range of parameter values. For $\\alpha = 0$, the curve is again horizontal."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fff8c90e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a = [0.0, 0.5, 1.0, 2.0, 6.0]\n",
+ "\n",
+ "p6 = z\n",
+ "zz = 1.0 - z\n",
+ "\n",
+ "for i in a:\n",
+ " za = zz**i\n",
+ " p6 = np.hstack((p6,za))\n",
+ "\n",
+ "cols = [\"pow\" + str(i) for i in a]\n",
+ "cols.insert(0,\"distance\")\n",
+ "\n",
+ "df6 = pd.DataFrame(p6,columns=cols)\n",
+ "df6.plot(x=\"distance\",y=cols[1:],kind=\"line\",title=\"Inverse Distance Power Function\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d7ec2734",
+ "metadata": {},
+ "source": [
+ "In contrast to the negative exponential function, the weights end up (by construction) with a value of zero at the bandwidth distance. Coefficient values less than one yield a distance decay with much larger weights for shorter distances, only dropping off more rapidly in the farthest distance range. Values of $\\alpha$ larger than one imply a steeper than linear decay. Even more so than for the negative exponential function, larger values for $\\alpha$ yield essentially zero weights for smaller and smaller distances. In the graph, this is already the case for $z$ around 0.4, again eliminating the impact of nearest neighbors that are farther away."
+ ]
+ },
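+ {
+ "cell_type": "markdown",
+ "id": "aa10f007",
+ "metadata": {},
+ "source": [
+ "As a quick numerical check (again plain arithmetic, for illustration only), the weights implied at $z = 0.4$ and $z = 0.8$ are computed below for the same values of $\\alpha$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa10f008",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# inverse distance power weights (1 - z)**alpha at z = 0.4 and z = 0.8\n",
+ "alphas = np.array(a)\n",
+ "powcheck = pd.DataFrame({\"alpha\": alphas,\n",
+ "                         \"weight at z=0.4\": (1.0 - 0.4) ** alphas,\n",
+ "                         \"weight at z=0.8\": (1.0 - 0.8) ** alphas})\n",
+ "print(powcheck)"
+ ]
+ },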
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "It is straightforward to experiment with some other parameter values and possibly other distance transformations. Consider some other kernel functions, currently not supported by PySAL, such as the tricube ($(1.0 - z^3)^3$) or cosine kernel ($(\\pi/4)\\cos((\\pi/2) z)$).\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/12_estimating_slx.ipynb.txt b/_sources/notebooks/12_estimating_slx.ipynb.txt
new file mode 100644
index 00000000..44f09897
--- /dev/null
+++ b/_sources/notebooks/12_estimating_slx.ipynb.txt
@@ -0,0 +1,864 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Estimating SLX Models\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/17/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, a closer look is taken at the estimation of SLX models, both the traditional linear specification as well as more recently introduced nonlinear forms. \n",
+ "\n",
+ "The `spreg` package implements the inclusion of spatially lagged explanatory variables in any specification by means of a non-zero `slx_lags` argument. This allows for the estimation of the linear SLX model by means of OLS, as well as spatial Durbin and SLX-Error models by means of specialized methods. In addition, as of Version 1.7, the `NSLX` module introduces the estimation of nonlinear SLX models, such as a negative exponential distance function and an inverse distance power transformation. The `NSLX` module is still somewhat experimental and under active development, but the current version is stable.\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+ "Familiarity with the basic setup of regressions in *spreg* as well as essentials of *numpy*, *pandas*, *geopandas*, and *libpysal* is assumed. In addition, it is assumed that the **chicagoSDOH** PySAL sample data set has been installed (for specific instructions, refer to the *sample data notebook*)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d8a50738",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main modules needed are `spreg.OLS` and `spreg.NSLX`. In addition, *libpysal* is needed for data import and spatial weights construction, and *geopandas* for data input from a shape file. This notebook is based on version 1.7 of *spreg*. \n",
+ "\n",
+ "As before, only the needed functions from *libpysal* are imported, specifically, `get_path` from `libpysal.examples`, and `libpysal.weights` as `weights`. \n",
+ "\n",
+ "Some additional imports are included to avoid excessive warning messages. With later versions of PySAL, these may not be needed. As before, the `set_printoptions` is used for *numpy* 2.0 and later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from libpysal.examples import get_path\n",
+ "import libpysal.weights as weights\n",
+ "from spreg import OLS, NSLX\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functionality Used\n",
+ "\n",
+ "- from geopandas:\n",
+ " - read_file\n",
+ " \n",
+ "- from libpysal:\n",
+ " - examples.get_path\n",
+ " - weights.Queen.from_dataframe\n",
+ " - weights.Kernel\n",
+ " \n",
+ "- from spreg:\n",
+ " - OLS\n",
+ " - NSLX"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b34442e2",
+ "metadata": {},
+ "source": [
+ "### Data, Weights and Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "effd05a4",
+ "metadata": {},
+ "source": [
+ "As in the previous notebooks, all data sets, weights files and variables are specified at the top, so that they can be easily changed to other examples.\n",
+ "\n",
+ "The data set is from the **chicagoSDOH** sample data set:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "\n",
+ "Contiguity weights and kernel weights are constructed by means of the `libpysal.weights` functionality. In addition to queen contiguity, adaptive bandwidth triangular kernel weights and adaptive bandwidth quadratic kernel weights are constructed with k=10. The contiguity weight is used in row-standardized form, while the kernel weights are kept as is.\n",
+ "\n",
+ "The model specification is purely to illustrate the various estimation methods and relates the variable **HIS_ct** (economic hardship index) to **Blk14P** (percentage Black households), **Hisp14P** (percentage Hispanic households), and **EP_NOHSDP** (percentage households without high school education).\n",
+ "\n",
+ "The centroid coordinates **COORD_X** and **COORD_Y** are used to construct kernel weights.\n",
+ "\n",
+ "The various initializations are carried out in two steps:\n",
+ "\n",
+ "- first, all file names and variable names are defined\n",
+ "- second, the files are read, the variables extracted, and the spatial weights constructed\n",
+ "\n",
+ "The first step allows for customization to other examples, the second step is agnostic to the actual files and variables that were specified. To keep the code simple, there are no error checks for missing files or mismatches in the variable names."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "829cca11",
+ "metadata": {},
+ "source": [
+ "#### Specify file and variable names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "fc2dba41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "y_name = 'HIS_ct'\n",
+ "x_names = ['Blk14P','Hisp14P','EP_NOHSDP']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "wq_name = 'Chi-SDOH_q'\n",
+ "wk10_name = 'Chi-SDOH_tri10'\n",
+ "wkq10_name = 'Chi-SDOH_quad10'\n",
+ "coordname = [\"COORD_X\",\"COORD_Y\"]\n",
+ "idvar = \"ID\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91648058",
+ "metadata": {},
+ "source": [
+ "#### Read files and extract variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4103013d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = weights.Queen.from_dataframe(dfs,ids=idvar)\n",
+ "wq.transform = 'r' # row-transform the weights\n",
+ "y = dfs[y_name]\n",
+ "x = dfs[x_names]\n",
+ "crdnts = dfs[coordname]\n",
+ "wk10 = weights.Kernel(np.array(crdnts),k=10,fixed=False,diagonal=True)\n",
+ "wkq10 = weights.Kernel(np.array(crdnts),k=10,function=\"quadratic\",fixed=False,diagonal=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd54d5c2",
+ "metadata": {},
+ "source": [
+ "## Estimating SLX Models\n",
+ "\n",
+ "As of Version 1.7 of *spreg*, several options to estimate the SLX model are available that go beyond the simple inclusion of spatially lagged explanatory variables by means of a non-zero `slx_lags` argument. A distinction is made between a linear model and a nonlinear model. In the linear model, both the usual row-standardized spatial weights and kernel weights (not row-standardized) are supported. The kernel weights can be conceived as introducing a form of nonlinearity, albeit in the weights and not in the coefficients. In contrast to other usage of the kernel weights (e.g., for HAC standard errors), in the SLX estimation, the diagonal elements are set to zero (and not kept as 1.0).\n",
+ "\n",
+ "The nonlinear estimation is implemented for a negative exponential distance function and an inverse distance power function. Instead of using the actual distance between observations, which is dependent on the scale of the metric chosen (e.g., meters vs. miles), the actual distance within a given bandwidth (either fixed or adaptive) is converted to a fraction of the bandwidth distance, ensuring that its range is between 0 and 1. More precisely, each distance is divided by the distance to the farthest nearest neighbor within the bandwidth. For an adaptive bandwidth, this is always the k-th nearest neighbor; for a fixed bandwidth, it is typically a nearest neighbor of order less than k.\n",
+ "\n",
+ "The exponential model transformation is based on $e^{-\\alpha z_{ij}}$, where $z_{ij} = d_{ij}/bw$, for $d_{ij} \\le bw$, and zero otherwise. The negative exponential transformation is built-in, so that the estimate for $\\alpha$ should be a positive value. The current implementation in *spreg* has a different $\\alpha$ parameter for each explanatory variable. \n",
+ "\n",
+ "For the power model, the transformation is $z_{ij}^\\alpha$ with $z_{ij} = 1 - d_{ij}/bw$, for $d_{ij} \\le bw$, and zero otherwise. In this context, the estimate for $\\alpha$ should be positive and larger than one. Note that since $z_{ij} \\lt 1$, the power $z_{ij}^\\alpha$ quickly becomes negligible (i.e., basically zero) for larger values of $\\alpha$. Here too, a different parameter is estimated for each explanatory variable.\n",
+ "\n",
+ "The estimation of the NSLX model uses a nonlinear optimization routine that is quite sensitive to the model specification. In ill-behaved specifications, nonsensical parameter estimates may result. In some instances, the model may fail to optimize altogether, yielding a `nan` for the objective function. Such cases should be taken as an indication of a poor specification and alternatives should be considered. In addition, one should keep in mind that the estimated coefficients should be small. Large coefficients are applied to the transformed distance, which is a fraction less than one. For example, a coefficient of 10 in a power function applied to a z-distance of 0.2 results in a weight of about 0.0000001 ($0.2^{10} \\approx 10^{-7}$), essentially zeroing out the lagged variable. Such large values can also easily result in singular covariance matrices.\n",
+ "\n",
+ "The parameter estimates are also very sensitive to the choice of the bandwidth, a common attribute of kernel estimation methods. Since these are distance decay functions, larger values for the $\\alpha$ parameters imply a steeper decline with distance. Everything else being the same, the same actual distance decay will tend to yield larger parameter values for a larger bandwidth, and vice versa. Unlike what holds in the linear model, the actual parameter values are not easy to interpret in an absolute sense. However, their effect on the multipliers can be assessed by means of the methods covered in an earlier notebook.\n",
+ "\n",
+ "For all models, there is the flexibility to only lag some of the explanatory variables. In addition, for the nonlinear models, it is possible to apply a different transformation to each variable. Unless there is a good theoretical reason to do so, this is generally not recommended.\n"
+ ]
+ },
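+ {
+ "cell_type": "markdown",
+ "id": "aa10f009",
+ "metadata": {},
+ "source": [
+ "To make the caution about large coefficients concrete, the short computation below (plain arithmetic, not part of `NSLX`) evaluates both transformations for a neighbor located at 80% of the bandwidth, for a few values of $\\alpha$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa10f010",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustration only: weights implied by the two transformations for a\n",
+ "# relatively distant neighbor at 80 percent of the bandwidth\n",
+ "dfrac = 0.8                         # d_ij / bw\n",
+ "for alpha in [1.0, 2.0, 10.0]:\n",
+ "    expw = np.exp(-alpha * dfrac)   # exponential: z = d/bw\n",
+ "    poww = (1.0 - dfrac) ** alpha   # power: z = 1 - d/bw\n",
+ "    print(f\"alpha={alpha:5.1f}  exponential weight {expw:.6f}  power weight {poww:.10f}\")"
+ ]
+ },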
+ {
+ "cell_type": "markdown",
+ "id": "59b80d13",
+ "metadata": {},
+ "source": [
+ "## Linear SLX"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5c09d3d",
+ "metadata": {},
+ "source": [
+ "### Spatial Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "942c1d1e",
+ "metadata": {},
+ "source": [
+ "#### OLS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3d8d40ea",
+ "metadata": {},
+ "source": [
+ "As a point of reference, the first estimation is for a classic regression by means of OLS. Spatial diagnostics are included as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61174574",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x,w=wq,spat_diag=True,\n",
+ " name_w=wq_name,name_ds=ds_name)\n",
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "560f607a",
+ "metadata": {},
+ "source": [
+ "The three regression coefficients are positive and highly significant. The overall fit is quite good, with an adjusted $R^2$ of 0.82 and associated sum of squared residuals of 26,459.9. Nevertheless, all the spatial diagnostics and their robust forms are highly significant, pointing to the presence of spatial effects. Specifically, the LM test for WX and its robust form are both significant as well, although it would seem that a lag model and/or a spatial Durbin specification may be most appropriate as the alternative.\n",
+ "\n",
+ "Importantly, the values of the robust statistics are all smaller than the corresponding original statistics, suggesting the appropriateness of the alternatives considered. In other words, all spatial models considered could be viable alternatives."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7079f324",
+ "metadata": {},
+ "source": [
+ "#### SLX "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ac32507d",
+ "metadata": {},
+ "source": [
+ "The linear SLX model with queen contiguity weights is invoked by means of `OLS` with the usual arguments, and with `slx_lags=1`. Spatial diagnostics are included as well (`spat_diag=True`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c368c21",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxw1 = OLS(y,x,w=wq,slx_lags=1,\n",
+ " spat_diag=True,\n",
+ " name_w=wq_name,name_ds=ds_name)\n",
+ "print(slxw1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "12455b27",
+ "metadata": {},
+ "source": [
+ "The overall fit of the model improves only slightly, to an adjusted $R^2$ of 0.84, with a sum of squared residuals of 23,475.4. All lag terms are highly significant, but two of them are negative (**W_Blk14P** and **W_Hisp14P**), while the third one (**W_EP_NOHSDP**) is positive. The interpretation of negative coefficients for the spatial lags is difficult, so this may point to a misspecification. While significant, the coefficient of **W_EP_NOHSDP** is slightly larger than that of **EP_NOHSDP** itself, which runs counter to Tobler's law.\n",
+ "\n",
+ "The inclusion of the SLX terms changes the spatial diagnostics. The original LM-Lag and LM-Error tests are still (very) significant, but their robust forms are not."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d7b8d45",
+ "metadata": {},
+ "source": [
+ "#### Higher order lags"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d3f4bfb",
+ "metadata": {},
+ "source": [
+ "Higher order lags are obtained by setting `slx_lags` to a value larger than one. However, this very quickly leads to problems with multicollinearity. To illustrate this property, the option `vif` is set to `True`. With `slx_lags = 2`, the other arguments are as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1821f3e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxw2 = OLS(y,x,w=wq,slx_lags=2,\n",
+ " spat_diag=True,vif=True,\n",
+ " name_w=wq_name,name_ds=ds_name)\n",
+ "print(slxw2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9fe8f86a",
+ "metadata": {},
+ "source": [
+ "The inclusion of the second order lags results in all lag terms but the first order lag of **EP_NOHSDP** to become non-significant. In addition, the multicollinearity is very problematic. The condition number is 89, well above the usual acceptable range. The VIF statistics show the problem with the spatially lagged variables, with a VIF value of almost 157 for **W2_Blk14P**. The effect on the spatial diagnostics is the same as with the first order lags: both LM-Lag and LM-Error tests are significant, but their robust versions are not.\n",
+ "\n",
+ "Clearly, in this example, nothing is gained by including the higher order lags."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "621ff764",
+ "metadata": {},
+ "source": [
+ "#### Selective lagged variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73523ef9",
+ "metadata": {},
+ "source": [
+ "Since the coefficient of **W_EP_NOHSDP** was least problematic in the first order model, `slx_vars` is used to remove the other lag terms. To this effect, it is set to the list `[False,False,True]` (the default is otherwise `slx_vars=\"All\"`), which will eliminate **W_Blk14P** and **W_Hisp14P** from the regression specification. All the other arguments are as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b600025",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxw3 = OLS(y,x,w=wq,slx_lags=1,slx_vars=[False,False,True],\n",
+ " spat_diag=True,\n",
+ " name_w=wq_name,name_ds=ds_name)\n",
+ "print(slxw3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "af11582b",
+ "metadata": {},
+ "source": [
+ "The inclusion of the spatial lag for just **EP_NOHSDP** makes the coefficient of **Hisp14P** not significant. On the other hand, the coefficient of the lag term is now smaller than that of the non-lagged variable, conforming to Tobler's law. The fit still gives an adjusted $R^2$ of 0.84, with a residual sum of squares of 24,117.4.\n",
+ "\n",
+ "The spatial diagnostics seem to point to potential spatial error dependence, with both the LM test and its robust form highly significant."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7d520a1",
+ "metadata": {},
+ "source": [
+ "### Kernel Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ecc2d5e4",
+ "metadata": {},
+ "source": [
+ "Kernel weights can be passed to an estimation routine in the same way as other weights, but only for `slx_lags = 1`. If a higher order lag is specified, `spreg` will raise an `Exception`. Also, the interpretation of the results is different from the standard linear case. While the diagonal elements are set to zero, just as for other spatial weights, kernel weights are *not* row-standardized. As a result, the magnitude of the associated coefficients is not directly comparable to the non-lagged counterparts. A detailed look at the implied multiplier effects can be gained by means of the `i_multiplier` function, as illustrated in a previous notebook.\n",
+ "\n",
+ "Also, importantly, the results of the diagnostics for spatial effects are not valid for kernel weights without adjustments, so they cannot be activated.\n",
+ "\n",
+ "In all other respects, the command is identical to the standard case."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1bc3787c",
+ "metadata": {},
+ "source": [
+ "#### Triangular kernel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6853b6d6",
+ "metadata": {},
+ "source": [
+ "The first example is for adaptive bandwidth triangular kernel weights with k=10, contained in **wk10**. Note that `spat_diag=True` is specified, even though this option is not available for kernel weights. As a result, the output will include a warning."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a7eb79a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxk1 = OLS(y,x,w=wk10,slx_lags=1,spat_diag=True,\n",
+ " name_w=wk10_name,name_ds=ds_name)\n",
+ "print(slxk1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e92d1ebd",
+ "metadata": {},
+ "source": [
+ "As in the linear case, all lag coefficients are significant, with negative signs for **W_Blk14P** and **W_Hisp14P**. In contrast to the linear case, the magnitude of the coefficients of the lag terms is not directly comparable to that of the original coefficients.\n",
+ "\n",
+ "In terms of fit, the results are similar to that of the linear model, with an adjusted $R^2$ around 0.84 and a sum of squared residuals of 23,811.7 (compared to 23,475.4 for queen contiguity).\n",
+ "\n",
+ "The results for the reduced model follow."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c9bbb74",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxk2 = OLS(y,x,w=wk10,slx_lags=1,spat_diag=True,\n",
+ " slx_vars=[False,False,True],\n",
+ " name_w=wk10_name,name_ds=ds_name)\n",
+ "print(slxk2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d737d812",
+ "metadata": {},
+ "source": [
+ "The same effect as in the linear model is observed, with **Hisp14P** becoming non-significant. The model fit is barely affected, with a sum of squared residuals of 24,602.7."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed9bfa72",
+ "metadata": {},
+ "source": [
+ "#### Quadratic kernel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e20b5236",
+ "metadata": {},
+ "source": [
+ "An alternative kernel function is the adaptive bandwidth quadratic function with k=10, given by **wkq10**. This second specification is considered next, first for all three variables, then for just **W_EP_NOHSDP**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "df24025f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxk3 = OLS(y,x,w=wkq10,slx_lags=1,\n",
+ " slx_vars = \"All\",\n",
+ " name_w=wkq10_name,name_ds=ds_name)\n",
+ "print(slxk3.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d0d3bd4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxk4 = OLS(y,x,w=wkq10,slx_lags=1,\n",
+ " slx_vars = [False,False,True],\n",
+ " name_w=wkq10_name,name_ds=ds_name)\n",
+ "print(slxk4.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "afbb1cbb",
+ "metadata": {},
+ "source": [
+ "The same pattern is observed as for the other weights. With the full model, all coefficients are highly significant, but **W_Blk14P** and **W_Hisp14P** have negative signs. When just **W_EP_NOHSDP** is included, **Hisp14P** becomes insignificant.\n",
+ "\n",
+ "The fit of the three specifications considered so far is very similar. Comparing the sum of squared residuals (the criterion of fit also used for the nonlinear models) for the reduced model, the linear SLX model achieves 24,117.4, the triangular kernel 24,602.7, and the quadratic kernel 24,535.3. In this example, there is very little difference between the three models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "248cca95",
+ "metadata": {},
+ "source": [
+ "## Estimating Nonlinear SLX Models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a89d6dd",
+ "metadata": {},
+ "source": [
+ "As mentioned, estimating nonlinear models can be quite tricky. The optimization routines are sensitive to model (mis)specifications and various numerical approximations can yield nonsensical results. So far, the `NSLX` module implements two specifications, a negative exponential distance function and an inverse distance power function. \n",
+ "\n",
+ "`NSLX` uses the `minimize` routine from `scipy.optimize` as the nonlinear optimizer. The default method is `BFGS` (Broyden, Fletcher, Goldfarb and Shanno), a quasi-Newton method that uses a numerical approximation for the first derivatives. Starting values for the parameters are the OLS regression estimates for the $\\beta$ coefficients and 1.0 for the $\\alpha$ coefficients (this is currently hard-coded). The optimization method also returns an approximation of the inverse Hessian, which can be used to compute asymptotic standard errors. However, as documented online in various forums, depending on the model, the approximation can be quite crude.\n",
+ "\n",
+ "Since the nonlinear SLX model consists of both a linear and a nonlinear part that are unrelated, the associated variance-covariance matrix is block-diagonal. The block for the linear part is the familiar $\\sigma^2 (X'X)^{-1}$, which is easy to compute. The block for the nonlinear part is $\\sigma^2 (\\hat{X}'\\hat{X})^{-1}$, where $\\hat{X}$ consists of vectors of partial derivatives with respect to the nonlinear parameters, with their final values substituted. Both blocks are relatively straightforward to compute analytically. Specifically, the partial derivatives with respect to $\\alpha$ are $-z\\, e^{-\\alpha z}$ for the exponential model, and $z^{\\alpha} \\ln(z)$ for the power model.\n",
+ "\n",
+ "The nonlinear part is notoriously ill-behaved, especially for the power model when the parameter estimates are much larger than one. In such instances, the model should be viewed with suspicion, given the problems with such powers, even though the numerical approximation may yield what looks like a valid variance-covariance matrix.\n",
+ "\n",
+ "The `NSLX` class takes a set of arguments that is slightly different from that for the other regression functions. As usual, `y` and `x` are required, but instead of passing spatial weights, a dataframe or numpy array with point coordinates is required as the `coords` argument. The coordinate values are used to build the scaled distance weights. The parameters for those weights are passed as `params`, which is a list of tuples consisting of three elements: the number of nearest neighbors (`k`), the bandwidth (`distance_upper_bound`), and the transformation. In the most typical application, the same parameters will be used for all the lagged variables so that only a single tuple needs to be specified in the `params` list. However, there is the flexibility to apply a separate set of parameters (and thus a different transformation) to each variable. The sequence of tuples in the list must match the sequence of variable names.\n",
+ "\n",
+ "The number of nearest neighbors is used to build the underlying KDTree, and also serves to derive the bandwidth. With the second item as `np.inf`, the bandwidth is adaptive and determined by the distance to the farthest k-nearest neighbor for each observation. For a fixed bandwidth, a distance upper bound must be specified. This is not trivial and depends on the scale of the coordinates (e.g., whether the distance will be in meters or kilometers). \n",
+ "\n",
+ "When a `distance_upper_bound` is set that is larger than the largest k-nearest neighbor distance, there is no effect. In order to be effective, the `distance_upper_bound` must be less than the maximum k-nearest neighbor distance for a given point. In this instance, it has the effect of imposing a fixed bandwidth, and it truncates the number of nearest neighbors to those within the bandwidth. As a result, the number of neighbors will be less than k. When a strict fixed bandwidth needs to be imposed, k should be set large enough (and `distance_upper_bound` small enough) so that the bandwidth is effective for every observation.\n",
+ "\n",
+ "The third argument is the transformation, either `\"exponential\"` or `\"power\"`. In contrast to the other SLX models, `slx_lags` is *not* an argument, but `slx_vars` remains available to impose selective lag transformations.\n",
+ "\n",
+ "Other arguments include a flag for the type of variance computed, with `var_flag = 1` (the default) for an analytical computation, and `var_flag = 0` for the inverse Hessian approximation. There is also a flag for a listing of a summary of convergence conditions, with `conv_flag = 1` for such a listing (`conv_flag=0` is the default). Finally, there is a `verbose` flag which lists the intermediate results for all iterations (`verbose = False` is the default to avoid sometimes very long listings), and an option to include `minimize`-specific options, as `options`, a dictionary with specific solver options (see the `scipy.optimize` documentation at https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html).\n",
+ "\n",
+ "A final remark is that the results of nonlinear optimization may vary by hardware, operating system and software versions. As a consequence, slight differences in the parameter estimates may occur."
+ ]
+ },
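+ {
+ "cell_type": "markdown",
+ "id": "aa10f011",
+ "metadata": {},
+ "source": [
+ "As a small sanity check on the analytical expressions for the partial derivatives used in the variance computation, the cell below compares a central finite difference to the analytic values at an arbitrary point. This is an illustration only and does not use any spreg internals."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa10f012",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# numerical check of the partial derivatives with respect to alpha\n",
+ "zd = 0.4          # arbitrary scaled distance\n",
+ "alpha = 1.5       # arbitrary parameter value\n",
+ "eps = 1.0e-6\n",
+ "d_exp_num = (np.exp(-(alpha+eps)*zd) - np.exp(-(alpha-eps)*zd)) / (2*eps)\n",
+ "d_pow_num = (zd**(alpha+eps) - zd**(alpha-eps)) / (2*eps)\n",
+ "print(\"exponential - numerical:\", d_exp_num, \" analytic:\", -zd*np.exp(-alpha*zd))\n",
+ "print(\"power       - numerical:\", d_pow_num, \" analytic:\", (zd**alpha)*np.log(zd))"
+ ]
+ },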
+ {
+ "cell_type": "markdown",
+ "id": "fe64c5ad",
+ "metadata": {},
+ "source": [
+ "## Negative Exponential Distance"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "743c3800",
+ "metadata": {},
+ "source": [
+ "#### Adaptive bandwidth"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39812a09",
+ "metadata": {},
+ "source": [
+ "The first two examples use an adaptive bandwidth with, respectively, 6 (**parme1**) and 10 (**parme2**) k-nearest neighbors. The adaptive bandwidth is given by `np.inf` and the transformation as `\"exponential\"`. The coordinates for the weights computation are contained in the numpy array **crdnts**, and **y** and **x** are as before. In the first example, all explanatory variables are lagged, with `slx_vars = \"All\"`. This is the default, so it does not need to be specified as an argument. It is included here for clarity.\n",
+ "\n",
+ "Analytical standard errors are obtained with `var_flag = 1` and a summary of the convergence properties is given with `conv_flag = 1`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "be589711",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "parme1 = [(6,np.inf,\"exponential\")]\n",
+ "parme2 = [(10,np.inf,\"exponential\")]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd0f2cb8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxe1 = NSLX(y=y,x=x,coords=crdnts,params=parme1,slx_vars=\"All\",\n",
+ " var_flag=1,conv_flag=1,\n",
+ " name_ds=ds_name,verbose=False)\n",
+ "print(nslxe1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6019b93c",
+ "metadata": {},
+ "source": [
+ "Even though the optimization process yields estimates, they are clearly suspect, given the associated (analytical) standard errors. The problems seem to be associated with **We_Blk14P** and **We_Hisp14P**, whereas the estimate for **We_EP_NOHSDP** seems more reasonable. The remainder of this example will proceed with just the latter.\n",
+ "\n",
+    "Parenthetically, the convergence summary indicates that 95 iterations were used, but reports a success status of **False**. In this example, this is likely due to some rounding issues and can be ignored. Note that the message states \"not necessarily achieved\". A good indication of the extent to which the optimization worked is to compare the sum of squared residuals to that achieved by the other models. At a value of 25,873.8, this is somewhat worse than for the other models, but in the same general ballpark.\n",
+ "\n",
+ "The next model is for the restricted specification."
+ ]
+ },
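+  {
+   "cell_type": "markdown",
+   "id": "7d2c4b1a",
+   "metadata": {},
+   "source": [
+    "To make this comparison explicit, the sum of squared residuals can also be recomputed directly from the residuals of the fitted object. The sketch below assumes that, as for the other `spreg` regression classes, the residual vector is available as the `u` attribute; if that is not the case, it can be obtained as the difference between the observed and predicted values.\n",
+    "\n",
+    "```python\n",
+    "# sum of squared residuals recomputed from the residuals\n",
+    "# (the u attribute is an assumption, mirroring the other spreg classes)\n",
+    "import numpy as np\n",
+    "\n",
+    "ssr = float(np.sum(np.asarray(nslxe1.u) ** 2))\n",
+    "print(f'sum of squared residuals: {ssr:,.1f}')\n",
+    "```\n",
+    "\n",
+    "The same calculation applied to the other fitted models gives the values used for comparison throughout this notebook."
+   ]
+  },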
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8bcac6eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxe2 = NSLX(y=y,x=x,coords=crdnts,params=parme1,slx_vars=[False,False,True],\n",
+ " var_flag=1,conv_flag=1,\n",
+ " name_ds=ds_name,verbose=False)\n",
+ "print(nslxe2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4131ac5c",
+ "metadata": {},
+ "source": [
+ "The estimated coefficients are identical to those in the full model, and so is the model fit. The only slight difference is in the analytical standard errors for the coefficient of **We_EP_NOHSDP** (0.18474 vs. 0.18572). Also, the number of iterations to convergence is quite a bit smaller than in the previous example (32 vs. 95).\n",
+ "\n",
+ "To illustrate the effect of the variance computation, the model is rerun with `var_flag = 0` for a numerical approximation using the inverse Hessian."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa9ec229",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxe3 = NSLX(y=y,x=x,coords=crdnts,params=parme1,slx_vars=[False,False,True],\n",
+ " var_flag=0,conv_flag=1,\n",
+ " name_ds=ds_name,verbose=False)\n",
+ "print(nslxe3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0c8e82a8",
+ "metadata": {},
+ "source": [
+ "The estimates are again identical, but the associated standard errors are quite different. The analytical derivation for the regression coefficients can be used as a standard for comparison, since it should be $\\sigma^2 (X'X)^{-1}$. Clearly, this is not the case for the numerical approximation."
+ ]
+ },
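+  {
+   "cell_type": "markdown",
+   "id": "5b8e2f90",
+   "metadata": {},
+   "source": [
+    "The comparison standard can be computed by hand. The sketch below evaluates $\\sigma^2 (X'X)^{-1}$ for the regression part of the model, with $X$ including a constant term. It assumes that the residual vector is available as the `u` attribute of the fitted object and that $\\sigma^2$ is estimated as the sum of squared residuals divided by n (a degrees-of-freedom correction would give slightly larger values).\n",
+    "\n",
+    "```python\n",
+    "# rough check of the analytical standard errors, sigma^2 (X'X)^{-1}\n",
+    "import numpy as np\n",
+    "\n",
+    "Xc = np.hstack((np.ones((len(y), 1)), np.asarray(x)))   # X with constant\n",
+    "e = np.asarray(nslxe3.u).reshape(-1, 1)                 # residuals (assumed attribute)\n",
+    "n = Xc.shape[0]\n",
+    "sig2 = float(np.sum(e ** 2)) / n\n",
+    "se = np.sqrt(np.diag(sig2 * np.linalg.inv(Xc.T @ Xc)))\n",
+    "print(se)   # compare to the analytical standard errors reported above\n",
+    "```"
+   ]
+  },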
+ {
+ "cell_type": "markdown",
+ "id": "f51b395f",
+ "metadata": {},
+ "source": [
+ "The effect of the bandwidth is explored by setting `k = 10`, as in **parme2**. Only the restricted model is considered, with analytical standard errors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45c8e55b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxe4 = NSLX(y=y,x=x,coords=crdnts,params=parme2,slx_vars=[False,False,True],\n",
+ " var_flag=1,conv_flag=1,\n",
+ " name_ds=ds_name,verbose=False)\n",
+ "print(nslxe4.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0931a3cd",
+ "metadata": {},
+ "source": [
+ "The regression coefficients are essentially the same as for the model with `k = 6`, but the lag coefficient is larger. Intuitively, this makes sense if the true range of interaction is less than suggested by the bandwidth of `k = 10`. As a consequence, the distance decay should be steeper, implied by the larger coefficient. Otherwise, the fit is marginally better, with a sum of squared residuals of 25,594.8 (compared to 25,873.8)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8017b798",
+ "metadata": {},
+ "source": [
+ "#### Fixed bandwidth"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d7979aed",
+ "metadata": {},
+ "source": [
+ "Two upper distance bounds are specified to assess the role of a fixed bandwidth, i.e., 20.0 and 6.0, with `k = 6`. This is specified in the parameter lists **parme3** and **parme4**. Only the restricted model is considered.\n",
+ "\n",
+ "To provide some context, for the Chicago SDOH data set, a distance band that ensures that each observation has at least one neighbor (the max-min nearest neighbor distance) is 8.99. For a bandwidth distance of 20, the number of neighbors ranges from 11 to 268, with a median of 153! For a bandwidth distance of 6, there are four observations that become isolates. The number of neighbors ranges from 0 to 40, with 17 as the median."
+ ]
+ },
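+  {
+   "cell_type": "markdown",
+   "id": "9e4d6a37",
+   "metadata": {},
+   "source": [
+    "These neighbor characteristics can be verified directly from the coordinates. The sketch below uses `scipy.spatial.cKDTree` to compute the largest nearest-neighbor distance (the max-min criterion) and the distribution of the number of neighbors within each candidate bandwidth. It is only a descriptive check and is independent of the weights used by `NSLX`.\n",
+    "\n",
+    "```python\n",
+    "# descriptive check on the candidate distance bands\n",
+    "import numpy as np\n",
+    "from scipy.spatial import cKDTree\n",
+    "\n",
+    "pts = np.asarray(crdnts)\n",
+    "tree = cKDTree(pts)\n",
+    "\n",
+    "# largest first-nearest-neighbor distance (k=2 includes the point itself)\n",
+    "d, _ = tree.query(pts, k=2)\n",
+    "print('max-min nearest neighbor distance:', d[:, 1].max())\n",
+    "\n",
+    "# number of neighbors within each bandwidth, excluding the point itself\n",
+    "for band in (20.0, 6.0):\n",
+    "    counts = np.array([len(nb) - 1 for nb in tree.query_ball_point(pts, r=band)])\n",
+    "    print(band, counts.min(), int(np.median(counts)), counts.max())\n",
+    "```"
+   ]
+  },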
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "d6390fd8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "parme3 = [(6,20.0,\"exponential\")]\n",
+ "parme4 = [(6,6.0,\"exponential\")]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c57157a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxe5 = NSLX(y=y,x=x,coords=crdnts,params=parme3,slx_vars=[False,False,True],\n",
+ " var_flag=1,conv_flag=1,\n",
+ " name_ds=ds_name,verbose=False)\n",
+ "print(nslxe5.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5785f701",
+ "metadata": {},
+ "source": [
+ "The result is identical to the case with an adaptive bandwidth. Even though the listing mentions an upper bound distance of 20, this is ineffective and the k-nearest neighbor criterion dominates.\n",
+ "\n",
+ "With an upper bound distance of 6 (**parme4**), the results for the coefficient estimates and model fit are slightly different (the latter is slightly worse, with a sum of squared residuals of 26,000.7).\n",
+ "\n",
+    "In general, unless there are good theoretical reasons to do otherwise, an adaptive bandwidth is preferred, since it ensures an equal number of neighbors for each observation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "00e8b2a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxe6 = NSLX(y=y,x=x,coords=crdnts,params=parme4,slx_vars=[False,False,True],\n",
+ " var_flag=1,conv_flag=1,\n",
+ " name_ds=ds_name,verbose=False)\n",
+ "print(nslxe6.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0e451ee",
+ "metadata": {},
+ "source": [
+ "## Inverse Distance Power"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78fbd3c7",
+ "metadata": {},
+ "source": [
+ "#### Adaptive bandwidth"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c062181",
+ "metadata": {},
+ "source": [
+    "In contrast to the exponential case, where the effect of the bandwidth was explored, here only one case is considered, an adaptive bandwidth with 10 k-nearest neighbors (**parmp1**). The adaptive bandwidth is given by `np.inf` and the transformation as `\"power\"`. The coordinates for the weights computation are contained in the numpy array **crdnts**, and **y** and **x** are as before. In the first example, all explanatory variables are lagged, with `slx_vars = \"All\"`. This is the default, so it does not need to be specified as an argument. As before, it is included here for clarity.\n",
+ "\n",
+ "Analytical standard errors are obtained with `var_flag = 1` and a summary of the convergence properties is given with `conv_flag = 1`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1697b09f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "parmp1 = [(10,np.inf,\"power\")]\n",
+ "nslxp1 = NSLX(y=y,x=x,coords=crdnts,params=parmp1,slx_vars=\"All\",\n",
+ " var_flag=1,conv_flag=1,verbose=False,\n",
+ " name_ds=ds_name)\n",
+ "print(nslxp1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3d1fcd4e",
+ "metadata": {},
+ "source": [
+    "What happened? The routine produces coefficient estimates, but no standard errors or p-values. Closer examination of the results reveals that the **Sum squared residual** has a value of **nan**. The sum of squared residuals is the objective function that is minimized by the nonlinear optimization routine. The result of **nan** indicates a lack of convergence. Also, only two iterations were performed.\n",
+ "\n",
+ "This is not a surprise given the very large values for the lag (and other) coefficients. The regression coefficients in particular should be similar to those obtained in OLS and the linear models, which is clearly not the case.\n",
+ "\n",
+ "In an attempt to remedy this, the same approach is used as before. The estimation is carried out with the lags for **Blk14P** and **Hisp14P** excluded, using `slx_vars = [False,False,True]`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8a2bf9ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxp2 = NSLX(y=y,x=x,coords=crdnts,params=parmp1,slx_vars=[False,False,True],\n",
+ " var_flag=1,conv_flag=1, verbose=False,\n",
+ " name_ds=ds_name)\n",
+ "print(nslxp2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ad182dcb",
+ "metadata": {},
+ "source": [
+    "Now, the model seems to converge and yields a parameter value for **Wp_EP_NOHSDP** of 6.30, but no standard error for that coefficient. The standard errors for the regression coefficients are not affected, but the value for $\\alpha$ causes the matrix product $(\\hat{X}'\\hat{X})$ to be singular, so that the analytical variance cannot be computed. \n",
+ "\n",
+ "The numerical approximation for the inverse Hessian does yield approximate standard errors, but these should be viewed with caution. This is accomplished by setting `var_flag = 0`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6089ca55",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nslxp3 = NSLX(y=y,x=x,coords=crdnts,params=parmp1,slx_vars=[False,False,True],\n",
+ " name_ds=ds_name,\n",
+ " var_flag=0,conv_flag=1)\n",
+ "print(nslxp3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2728f7be",
+ "metadata": {},
+ "source": [
+ "The approximation suggests the coefficient is significant. Of all the SLX models, the power function achieves the worst fit (but only marginally so), with a sum of squared residuals of 25,742.3 (compared to 25,594.8 for a similar exponential model).\n",
+ "\n",
+ "Clearly, in the current example, there is little gain in going with the nonlinear specification over the linear one. Of all the models, the best fit is achieved by the linear SLX specification based on queen contiguity weights, with a sum of squared residuals of 24,117. However, given the sensitivity of the results to the choice of weights, bandwidth and functional specification, further experimentation would be warranted."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+    "The SLX model provides the opportunity to compare a wide range of specifications, using different weights and assessing the possible contribution of nonlinear specifications. Try this out for your own baseline model. However, if the diagnostics for spatial dependence in the OLS estimation do not show a significant value for the LM-WX statistic, you may need to try a different base specification to obtain meaningful results. Pay particular attention to the interpretation of the coefficients of the lag terms in relation to Tobler's law and the sensitivity of the estimates to the choice of a bandwidth."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/13_ML_estimation_spatial_lag.ipynb.txt b/_sources/notebooks/13_ML_estimation_spatial_lag.ipynb.txt
new file mode 100644
index 00000000..e07530a9
--- /dev/null
+++ b/_sources/notebooks/13_ML_estimation_spatial_lag.ipynb.txt
@@ -0,0 +1,887 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "89145993",
+ "metadata": {},
+ "source": [
+ "# Maximum Likelihood Estimation - Spatial Lag Model\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/19/2024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a85427f9",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "This notebook is the first of two that deal with the estimation of the spatial Lag model and the spatial Durbin model. Here, the Maximum Likelihood approach is illustrated. Instrumental variable estimation is considered in a separate notebook.\n",
+ "\n",
+ "The maximum likelihood estimation in `spreg` is primarily included for pedagogical purposes. Generally, the instrumental variables approach is preferred. In addition, an optimal maximum likelihood estimation implementation, based on the Smirnov-Anselin (2001) approximation, is not currently implemented in `spreg`. It is implemented in C++ in `GeoDa`. This is the preferred approach for ML estimation in large(r) data sets, although it currently does not support estimation of the spatial Durbin specification (this must be implemented by hand by constructing the spatially lagged explanatory variables explicitly).\n",
+ "\n",
+ "The `spreg` module implements ML estimation of the spatial lag model in the `ML_Lag` class. Given the problems in the optimization of the log-likelihood for the SARSAR model (and the issues with interpretation of the results), ML estimation of this model is purposely not included. For the same reason, the general nested model is not implemented either. These models can be estimated by means of IV/GMM.\n",
+ "\n",
+ "The estimation of the Spatial Durbin model is implemented through the inclusion of the `slx_lags` argument. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9a2c9eb",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "As before, the main module is *spreg* for spatial regression analysis. From this, `OLS` and `ML_Lag` are imported. In addition, the utilities in *libpysal* (to open spatial weights and access the sample data set), *pandas* and *geopandas* are needed, as well as *time* (for some timing results), *matplotlib.pyplot* and *seaborn* for visualization. All of these rely on *numpy* as a dependency. Finally, in order to carry out the Likelihood Ratio tests, `likratiotest` is imported from `spreg.diagnostics`.\n",
+ "\n",
+ "The usual *numpy* `set_printoptions` is included as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ac490b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "import time\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "from libpysal.weights import lag_spatial\n",
+ "\n",
+ "from spreg import OLS, ML_Lag\n",
+ "from spreg.diagnostics import likratiotest\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ee18820",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " - DataFrame\n",
+ " - head\n",
+ " - describe\n",
+ " - corr\n",
+ " \n",
+ "- from libpysal:\n",
+ " - io.open\n",
+ " - examples.get_path\n",
+ " - weights.lag_spatial\n",
+ " \n",
+ "- from numpy:\n",
+ " - hstack\n",
+ "\n",
+ "- from matplotlib/seaborn:\n",
+ " - regplot\n",
+ " - show\n",
+ "\n",
+ "- from spreg:\n",
+ " - spreg.OLS\n",
+ " - spreg.ML_Lag\n",
+ " - spreg.diagnostics.likratiotest"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47934d1b-7587-4ddf-be38-83904eede8e8",
+ "metadata": {},
+ "source": [
+ "### Variable definition and data input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfb04af4-4521-4ac0-8e55-d3b92f54a403",
+ "metadata": {},
+ "source": [
+ "The data set and spatial weights are from the **chicagoSDOH** sample data set:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity weights\n",
+ "\n",
+    "To illustrate the methods, a descriptive model is used that relates the rate of uninsured households in a tract (for health insurance, **EP_UNINSUR**) to the lack of high school education (**EP_NOHSDP**), the economic deprivation index (**HIS_ct**), limited command of English (**EP_LIMENG**) and the lack of access to a vehicle (**EP_NOVEH**). This is purely illustrative of a spatial lag specification and does not have a particular theoretical or policy motivation.\n",
+ "\n",
+ "The file names and variable names are set in the usual manner. Any customization for different data sets/weights and different variables should be specified in this top cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "14a1e98e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infileq = get_path(\"Chi-SDOH_q.gal\") # queen contiguity weights created with GeoDa\n",
+ "\n",
+ "y_name = 'EP_UNINSUR'\n",
+ "x_names = ['EP_NOHSDP','HIS_ct','EP_LIMENG','EP_NOVEH']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "w_name = 'Chi-SDOH_q'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9514984-a22e-4d5b-a993-f40771a290a0",
+ "metadata": {},
+ "source": [
+ "The `read_file` and `open` functions are used to access the sample data set and contiguity weights. The weights are row-standardized and the data frames for the dependent and explanatory variables are constructed. As before, this functionality is agnostic to the actual data sets and variables used, since it relies on the specification given in the initial block above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3248d23d-2247-40bf-a3a5-3ba21b477aa8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = open(infileq).read() \n",
+    "wq.transform = 'r'    # row-standardize the weights\n",
+ "y = dfs[y_name]\n",
+ "x = dfs[x_names]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2aea09bf",
+ "metadata": {},
+ "source": [
+ "## OLS and SLX with Spatial Diagnostics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb5eb774-814b-467e-ad51-11d21d3cee50",
+ "metadata": {},
+ "source": [
+ "Standard OLS and SLX regressions with spatial diagnostics are carried out to provide a point of reference. Moran's I is included by setting `moran=True` and, of course, `spat_diag=True` as well. Refer to the specific OLS notebook for further details."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fdb2b8ce-6ae8-4a15-a6c7-7ec23700dfc7",
+ "metadata": {},
+ "source": [
+ "### OLS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "379724ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x,w=wq,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d0e0f74-5877-4497-8e7b-79394b6701fd",
+ "metadata": {},
+ "source": [
+ "The specification achieves an acceptable $R^2$ of about 0.63 and all coefficients are positive and highly significant.\n",
+ "\n",
+    "The non-spatial diagnostics suggest non-normality as well as a high degree of heteroskedasticity. There is no problem with multicollinearity.\n",
+ "\n",
+ "The spatial diagnostics against the SARERR alternatives show very significant LM-Lag and LM-Error, but of the two robust tests, only RLM-Lag is highly significant (RLM-Error only at p < 0.03). Hence, there is a strong indication that a Lag rather than an Error alternative may be appropriate. While the joint LM test is also highly significant, this is likely due to a strong one-sided (Lag) alternative.\n",
+ "\n",
+ "Interestingly, the diagnostics against a spatial Durbin alternative strongly support the latter as well. Both LM tests and their robust forms are highly significant, and so is the joint test. Moreover, the value for the robust forms of the test is smaller than the original, which is the expected behavior (although not always reflected in empirical practice).\n",
+ "\n",
+ "In sum, in addition to a spatial Lag model as an alternative, the spatial Durbin specification deserves consideration as well."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0cfc5d4e-a1b3-4b07-8120-2def922d8ef6",
+ "metadata": {},
+ "source": [
+ "### SLX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "53f9b90d-7f6d-4b3e-9e99-83430957fe3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slx1 = OLS(y,x,w=wq,slx_lags=1,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(slx1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ee2be15-2067-4caf-b02b-aca6b46118c5",
+ "metadata": {},
+ "source": [
+    "Relative to the classic regression model, the fit improves slightly, but the constant, **EP_NOHSDP** and **HIS_ct** become non-significant at p = 0.01 (they are marginally significant at p = 0.05). All but one coefficient of the SLX terms are significant (**W_EP_NOVEH** is not). The signs and magnitudes of the SLX coefficients relative to their unlagged counterparts remain a bit confusing. Only for **EP_LIMENG** and **W_EP_LIMENG** are they the same, with the lag coefficient smaller than the unlagged one, in accordance with Tobler's law. The coefficient for **W_HIS_ct** is significant and larger than that of **HIS_ct**, while the latter is not significant at p = 0.01. In other words, the interpretation of these results in terms of distance decay and Tobler's law may be a bit problematic.\n",
+ "\n",
+ "In terms of diagnostics, there is a slight problem with multicollinearity (often the case in SLX specifications), strong non-normality and evidence of heteroskedasticity. Moran's I is significant, as are both LM-tests, but neither of the robust forms is significant. Based on the relative magnitudes of the test statistics, there is a slight indication of a possible Lag alternative, i.e., a spatial Durbin specification. However, this indication is not as strong as that provided by the LM-SDM test statistics in the classic specification."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8a0c186-dd30-4f64-9a5b-d95bb562ebde",
+ "metadata": {},
+ "source": [
+ "## ML Estimation of the Lag Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cc74112-a2eb-4efa-9116-d1e5197d20ed",
+ "metadata": {},
+ "source": [
+ "### Principle"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0385d805-bfb4-4e79-b0ed-0fc59c811926",
+ "metadata": {},
+ "source": [
+ "The point of departure of the Maximum Likelihood estimation of the spatial Lag model is an assumption of normal, independent and identically distributed errors. From this, the distribution for the observable vector of the dependent variable is obtained as multivariate normal.\n",
+ "\n",
+ "For the lag model $y = \\rho Wy + X\\beta + u$, the corresponding log-likelihood is:\n",
+ "\n",
+ "\n",
+ "$$\\ln L = \\ln | I - \\rho W | -(n/2)(\\ln 2\\pi) - (n/2) \\ln \\sigma^2 \\\\\n",
+ " - (1/2 \\sigma^2)(y - \\rho W y - X \\beta)'(y - \\rho W y - X \\beta)$$\n",
+ "\n",
+ "Except for the first term, this is identical to the log-likelihood in the classic regression model. The maximization of the classic log-likelihood would correspond to the minimization of the sum of squared residuals, in this case $(y - \\rho W y - X \\beta)'(y - \\rho W y - X \\beta)$, but this ignores the Jacobian term $\\ln | I - \\rho W |$. As a consequence, simple minimization of the sum of squared residuals (i.e., OLS), which ignores this Jacobian term, will yield biased estimates.\n",
+ "\n",
+ "Maximization of the log-likelihood is simplified since a *concentrated* likelihood can be derived that is only a function of the single parameter $\\rho$. Once an estimate for $\\rho$ is obtained, the corresponding estimates for $\\beta$ and $\\sigma^2$ are easily computed. For technical details, see Chapter 8 of Anselin and Rey (2014).\n",
+ "\n",
+ "Inference is based on an asymptotic variance matrix, which is computed as the inverse of the so-called information matrix (the expected value of the matrix of second partial derivatives of the log-likelihood function)."
+ ]
+ },
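+  {
+   "cell_type": "markdown",
+   "id": "2f7c0d15",
+   "metadata": {},
+   "source": [
+    "To make the idea of the concentrated likelihood concrete, the sketch below writes it out explicitly with dense matrix algebra and maximizes it over $\\rho$ with a scalar optimizer. With $e_0$ as the OLS residuals of $y$ on $X$ and $e_1$ as the OLS residuals of $Wy$ on $X$, the concentrated log-likelihood is, up to a constant, $\\ln |I - \\rho W| - (n/2) \\ln [(e_0 - \\rho e_1)'(e_0 - \\rho e_1)/n]$. This is purely an illustration of the principle and not how `spreg` is implemented internally; it is only practical for small data sets.\n",
+    "\n",
+    "```python\n",
+    "# dense sketch of the concentrated log-likelihood for the spatial lag model\n",
+    "import numpy as np\n",
+    "from scipy.optimize import minimize_scalar\n",
+    "\n",
+    "Wd = np.asarray(wq.full()[0])                       # dense weights matrix\n",
+    "yv = np.asarray(y).reshape(-1, 1)\n",
+    "Xc = np.hstack((np.ones((len(yv), 1)), np.asarray(x)))\n",
+    "n = len(yv)\n",
+    "\n",
+    "def ols_resid(z):\n",
+    "    # residuals from an OLS regression of z on Xc\n",
+    "    return z - Xc @ np.linalg.lstsq(Xc, z, rcond=None)[0]\n",
+    "\n",
+    "e0 = ols_resid(yv)\n",
+    "e1 = ols_resid(Wd @ yv)\n",
+    "\n",
+    "def negcll(rho):\n",
+    "    er = e0 - rho * e1\n",
+    "    jac = np.linalg.slogdet(np.eye(n) - rho * Wd)[1]\n",
+    "    return -(jac - (n / 2.0) * np.log(float(np.sum(er ** 2)) / n))\n",
+    "\n",
+    "res = minimize_scalar(negcll, bounds=(-0.99, 0.99), method='bounded')\n",
+    "print('rho from the dense sketch:', res.x)\n",
+    "```"
+   ]
+  },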
+ {
+ "cell_type": "markdown",
+ "id": "1bfd7648-bdd1-4ebb-bcf5-4dc74782693d",
+ "metadata": {},
+ "source": [
+ "### Implementation methods"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ada2a0d-81fc-4b07-8ba9-d7e1bc101b1a",
+ "metadata": {},
+ "source": [
+    "ML estimation of the classic spatial lag model is implemented by means of `spreg.ML_Lag`, with all the standard regression arguments (i.e., at a minimum, **y**, **x** and **w**). Three different methods are implemented: `full`, `ord` and `LU`. These differ only in the way the Jacobian term $\\ln | I - \\rho W |$ is computed. As the logarithm of the determinant of an $n \\times n$ matrix, this calculation runs into numerical difficulties for large(r) data sets.\n",
+ "\n",
+ "The default optimization method is *brute force*, or `method=\"full\"`.\n",
+ "This uses dense matrix expressions to calculate the required determinants and inverse matrix terms.\n",
+ "This method should *not* be used for large(r) data sets.\n",
+ "\n",
+ "The Ord eigenvalue method, `method=\"ord\"` (Ord, 1975) uses the eigenvalues of the spatial weights matrix as a shortcut to\n",
+ "compute the Jacobian determinant. Since this method relies on eigenvalue computations, it also is\n",
+ "not reliable for large(r) data sets. The `method` argument must be included (since the default\n",
+ "is `method=\"full\"`).\n",
+ "\n",
+    "The `method=\"LU\"` uses the LU matrix decomposition for sparse matrices to efficiently\n",
+ "compute the Jacobian determinant for large data sets. The sparse matrix conversion is\n",
+ "done internally, so the only needed additional argument is `method = \"LU\"`. This is the only reliable method for larger data sets."
+ ]
+ },
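+  {
+   "cell_type": "markdown",
+   "id": "6a1b3c58",
+   "metadata": {},
+   "source": [
+    "The three methods differ only in how the log-Jacobian $\\ln | I - \\rho W |$ is evaluated. The sketch below illustrates the three computational strategies side by side for an arbitrary value of $\\rho$; it is for intuition only, since `spreg` carries out these computations internally.\n",
+    "\n",
+    "```python\n",
+    "# three ways to compute the log-Jacobian ln|I - rho W|\n",
+    "import numpy as np\n",
+    "from scipy.sparse import csc_matrix, identity\n",
+    "from scipy.sparse.linalg import splu\n",
+    "\n",
+    "rho = 0.4\n",
+    "Wd = np.asarray(wq.full()[0])\n",
+    "n = Wd.shape[0]\n",
+    "\n",
+    "# 'full': dense log-determinant\n",
+    "jac_full = np.linalg.slogdet(np.eye(n) - rho * Wd)[1]\n",
+    "\n",
+    "# 'ord': sum of log(1 - rho * eigenvalue) over the eigenvalues of W\n",
+    "eigs = np.linalg.eigvals(Wd)\n",
+    "jac_ord = np.sum(np.log(1.0 - rho * eigs)).real\n",
+    "\n",
+    "# 'LU': sparse LU decomposition; since L has a unit diagonal, the\n",
+    "# determinant (positive for admissible rho) is the product of the\n",
+    "# diagonal elements of U\n",
+    "A = csc_matrix(identity(n, format='csc') - rho * csc_matrix(Wd))\n",
+    "jac_lu = np.sum(np.log(np.abs(splu(A).U.diagonal())))\n",
+    "\n",
+    "print(jac_full, jac_ord, jac_lu)\n",
+    "```"
+   ]
+  },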
+ {
+ "cell_type": "markdown",
+ "id": "75672c3c-9170-4135-bdaf-2a335eec8090",
+ "metadata": {},
+ "source": [
+ "The ML estimation is illustrated for the same specification as before, first using `method=\"full\"`. Since this is also the default, it is not necessary to explicitly include this argument, but it is listed here for clarity. To compare the relative speed of the different methods, `time.time()` is used.\n",
+ "\n",
+ "In addition, since the impacts calculation is set to `simple` by default, it is turned off for now by means of `spat_impacts = None` (see below for more specifics)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71307ef9-cc53-46a6-866e-5bd521e4503e",
+ "metadata": {},
+ "source": [
+ "#### Method `full`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1975d6b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "lag1a = ML_Lag(y,x,w=wq,method=\"full\",\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=None)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(lag1a.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec92847a-2982-4d67-a514-da927b0fd7f6",
+ "metadata": {},
+ "source": [
+ "The spatial autoregressive coefficient (**W_EP_UNINSUR**) at 0.39 is highly significant. The effect of its inclusion on the other coefficient estimates is major as well. All are substantially smaller than in the classic regression, except for the coefficient of **EP_NOVEH**, which is marginally larger. The main effect is on the constant term and the coefficient of **EP_NOHSDP**, neither of which is any longer significant. In essence, ignoring the spatial spillover effects in the classic regression means that some of these spillovers are reflected in the regression coefficients, which become biased with respect to their true magnitude. The spillover effects are considered more closely in the discussion of impacts below.\n",
+ "\n",
+ "In addition to the coefficient estimates, the output includes information about the model fit. There are two **Pseudo R-squared** measures: one is based on the *naive residuals*, $e = y - \\hat{\\rho} Wy - X\\hat{\\beta}$, the other (**Spatial Pseudo R-squared**) is computed from the forecasting errors when using the reduced form to compute predicted values. The two types of predicted values and residuals are included in the regression object as **predy** and **u** for the naive form and **predy_e** and **e_pred** for the reduced form results. As is typically the case, the measure of fit based on the reduced form predicted values is (slightly) worse than the naive one.\n",
+ "\n",
+ "Other indications of the fit of the model (although strictly speaking not measures of fit) are the **Log Likelihood** (-2418.99), the **Akaike info criterion** (4849.97), and the **Schwarz criterion**, also sometimes referred to as BIC (4878.01). Compared to the results for the classic regression (respectively -2465.21, 4940.42, and 4963.79), the log-likelihood is clearly less negative (thus larger) and the AIC and SC are smaller (better) than their counterparts.\n",
+ "\n",
+ "Other interesting attributes of the regression object are the regression coefficients, in **betas**, with the spatial autoregressive coefficient as the last element. The latter is also included separately as **rho**. The standard errors are in **std_err**, z-statistics and p-values in **z_stat**, and the complete variance-covariance matrix is **vm**."
+ ]
+ },
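+  {
+   "cell_type": "markdown",
+   "id": "4e9f7b26",
+   "metadata": {},
+   "source": [
+    "The information criteria follow the standard definitions, $AIC = -2 \\ln L + 2k$ and $SC = -2 \\ln L + k \\ln n$, where $k$ is the number of estimated coefficients (here the constant, the four slopes and $\\rho$, so $k = 6$; the error variance is not counted). A quick arithmetic check with the reported log-likelihood reproduces the listed values.\n",
+    "\n",
+    "```python\n",
+    "# arithmetic check of the reported information criteria\n",
+    "import numpy as np\n",
+    "\n",
+    "logl = -2418.99     # reported log-likelihood\n",
+    "k = 6               # constant, four slopes and rho\n",
+    "n = 791             # number of observations\n",
+    "\n",
+    "print('AIC:', -2.0 * logl + 2.0 * k)           # approximately 4849.97\n",
+    "print('SC: ', -2.0 * logl + k * np.log(n))     # approximately 4878.01\n",
+    "```"
+   ]
+  },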
+ {
+ "cell_type": "markdown",
+ "id": "9866e7c6-4fc9-4cf4-af2f-7587b6c2a490",
+ "metadata": {},
+ "source": [
+ "The contents of the **betas** and **rho** attributes show how the estimate for $\\rho$ is also the last element in **betas**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "386e6b44-9f51-4572-9ea1-7fa98f7beb4f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "print(\"betas \",lag1a.betas)\n",
+ "print(\"rho \",lag1a.rho)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2d3b5229-f169-4d15-aafe-416f325183a5",
+ "metadata": {},
+ "source": [
+ "#### Method `ord`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6ba06a23-111a-430e-bd60-ce54ee60a660",
+ "metadata": {},
+ "source": [
+ "The Ord eigenvalue method is invoked by means of `method=\"ord\"`. All other attributes are the same as before (with again `spat_impacts = None`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1ec4ab7-5eed-4c14-b046-2f9efff13642",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "lag1b = ML_Lag(y,x,w=wq,method=\"ord\",\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=None)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(lag1b.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff1ea981-669d-4ba0-a200-6f41c17717f0",
+ "metadata": {},
+ "source": [
+ "The coefficient estimates are identical to those obtained with the `full` method. There are some slight differences in the computed standard errors (and thus also in the z-values and p-values), but the overall effect is minimal. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0439c7c-771a-446f-8677-51570d7f8dea",
+ "metadata": {},
+ "source": [
+ "#### Method `LU`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "846b5ac1-e4b0-4b99-8a56-7663403935d1",
+ "metadata": {},
+ "source": [
+ "Again, all arguments are the same, except for `method = \"LU\"`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e485647-951e-4b62-80be-b1bb505b5027",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "lag1c = ML_Lag(y,x,w=wq,method=\"LU\",\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=None)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(lag1c.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34682841-1194-4f32-a14d-1aea4ce6b9e3",
+ "metadata": {},
+ "source": [
+ "In this case, the estimation results are identical to those for the `full` method."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a9f084bf-276f-44a7-863b-41835831d7ee",
+ "metadata": {},
+ "source": [
+ "## Spatial Multipliers - Impacts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b6ed0760-3ba2-4f0e-a06d-a295c13e29b4",
+ "metadata": {},
+ "source": [
+ "In models that include a spatially lagged dependent variable Wy, with or without additional spatially lagged explanatory variables, the\n",
+ "impact of a change in X on y is not simply the coefficient of X, as is the case in the standard regression model. Instead, the effect that results from changes in the neighboring values must also be accounted for. These are the *spatial multipliers*, *indirect effects* or *spatial impacts*.\n",
+ "\n",
+ "In Kim, Phipps and Anselin (2003), it was shown that if the change in the explanatory variable is uniform across observations, the *spatial multiplier* is $1 / (1- \\rho)$, with the total effect of a change in variable\n",
+ "$x_k$ amounting to $\\beta_k / (1 - \\rho)$. In the example, the spatial multiplier in the spatial lag model (**lag1a**) would be \n",
+ "1.0 / (1.0 - 0.39151) = 1.643.\n",
+ "\n",
+ "The Kim et al. approach distinguishes between the direct effect, i.e., the coefficient of the $\\beta$ coefficients as estimated, and the total effect, which corresponds to this coefficient times the multiplier. An indirect effect is then the difference between the two.\n",
+ "\n",
+    "LeSage and Pace (2009) introduce a slightly different set of concepts and use the terms average direct impact ($ADI$), average indirect impact ($AII$) and average total impact ($ATI$) as summaries computed from the matrix expression $(I - \\rho W)^{-1}$ in the reduced form, $(I - \\rho W)^{-1}X\\beta$. The main difference is that what they refer to as *direct* effect also includes some feedbacks as reflected in the diagonal elements of the inverse matrix. As a result, in their approach, the direct effects will differ from the estimates for $\\beta$. \n",
+ "\n",
+    "More formally, LeSage-Pace define $ADI$ as the average trace of the inverse matrix, or, $ADI = (1/n) tr[(I - \\rho W)^{-1}] = (1/n) \\sum_i [(I - \\rho W)^{-1}]_{ii}$. The $ATI$ is the average of all the elements of the matrix, or, $ATI = (1/n) \\sum_i \\sum_j [(I - \\rho W)^{-1}]_{ij}$. Note that with some algebra, one can show that, for row-standardized weights, this equals $1 / (1 - \\rho)$, the same as the total multiplier in the Kim et al. approach.\n",
+ "\n",
+ "The $AII$ then follows as $ATI - ADI$. The actual impacts are obtained by multiplying the $\\beta$ coefficient by respectively $ATI$, $ADI$ and $AII$.\n",
+ "\n",
+ "The impact measures are listed in the spatial lag regression output when the `spat_impacts` argument is specified (it is by default set to `spat_impacts = \"simple\"`). Options include `simple` (Kim et al. approach), `full` and `power` (both based on LeSage-Pace, but with `full` using a dense matrix computation for the inverse, whereas `power` uses a power approximation with higher order weights), as well as `all`, for all three. In addition, any combination of methods can be passed in a list. For example, to obtain both Kim et al. and LeSage-Pace measures, the argument can be set as\n",
+ "`spat_impacts = [\"simple\",\"full\"]`, as in the listing below. Since the default setting is `spat_impacts = \"simple\"`, when listing the impacts is not desired, `spat_impacts` must explicitly be set to `None`.\n",
+ "\n",
+ "Based on extensive timing experiments for the LeSage-Pace approach, the `power` method is superior for data sets with more than 5,000 observations. For larger data sets, it quickly becomes the only viable option, being orders of magnitude faster than the brute force calculations. The Kim et al. approach has no such limitation, since it does not use any matrix calculations.\n",
+ "\n",
+ "Note that the reported impacts are only *average* effects. See the spatial multipliers notebook for a more extensive analysis of the associated spatial pattern."
+ ]
+ },
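+  {
+   "cell_type": "markdown",
+   "id": "8c5d1e42",
+   "metadata": {},
+   "source": [
+    "As a check on the reported impacts, the summary measures can be computed directly with dense matrix algebra from the estimate in **lag1a**. This brute-force inverse is only feasible for smaller data sets, which is exactly why the `power` approximation is provided. Each variable-specific impact is then the corresponding $\\beta$ coefficient multiplied by these factors.\n",
+    "\n",
+    "```python\n",
+    "# direct computation of ADI, AII and ATI (dense inverse, small n only)\n",
+    "import numpy as np\n",
+    "\n",
+    "rho = float(lag1a.rho)\n",
+    "Wd = np.asarray(wq.full()[0])\n",
+    "n = Wd.shape[0]\n",
+    "\n",
+    "inv = np.linalg.inv(np.eye(n) - rho * Wd)\n",
+    "adi = np.trace(inv) / n          # average direct impact\n",
+    "ati = inv.sum() / n              # average total impact\n",
+    "aii = ati - adi                  # average indirect impact\n",
+    "\n",
+    "print(adi, aii, ati, 1.0 / (1.0 - rho))   # ATI equals the simple multiplier\n",
+    "```"
+   ]
+  },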
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4cb376e5-d8b2-4aaf-aad9-e3c659613419",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag2 = ML_Lag(y,x,w=wq,method=\"full\",\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=['simple','full'])\n",
+ "print(lag2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc10afcc-0360-4c76-9482-dc6ba2f57153",
+ "metadata": {},
+ "source": [
+ "The listing of **SPATIAL LAG MODEL IMPACTS** for the `simple` and `full` methods clearly illustrates the slight differences between the two approaches. The **Total** effect is the same for both, but the distribution of the effect between **Direct** and **Indirect** is slightly different. For the `simple` method, the **Direct** effects are identical to the coefficient estimates, but for the `full` method, they are slightly larger. This is due to the use of the diagonal elements of the inverse matrix, rather than the original estimates. As a result, some of the spillover feed-back effects are characterized as **Direct**. As a consequence, the part attributed to the **Indirect** effect is larger for the `simple` method than for the `full` method.\n",
+ "\n",
+ "These measures should only be interpreted as rough indications of spatial spillovers since they are both based on a rather unrealistic assumption of a uniform change in the X-variable. Also, the **Total** effect is simply the original coefficient times the spatial multiplier. For example, for **EP_LIMENG**, this amounts to $0.3116 \\times 1.643 = 0.512$.\n",
+ "\n",
+ "This total effect can be compared to that implied by the SLX model, which would be the sum of $\\beta$ and $\\gamma$. For example, for **EP_LIMENG**, this would be $0.38547 + 0.22040 = 0.60587$, which is actually larger than the total effect suggested by the spatial lag model. In part, this can be explained by recognizing that the SLX model is a truncated form of the reduced form for the spatial lag specification, i.e., only the first order contiguity elements are included. When ignoring the remainder, its impact tends to be attributed to the coefficients of $\\beta$ and $\\gamma$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e768c5f-9216-48f7-ac1f-f25c2d7e0a43",
+ "metadata": {},
+ "source": [
+ "## Predicted Values and Residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1c4af1a9-aba6-4b50-b4f1-f45694e634c4",
+ "metadata": {},
+ "source": [
+ "As mentioned, the *naive* predicted values and residuals are attributes of the regression object as **predy** and **u**. These are somewhat misleading, since they take the value of $Wy$ as observed, in a so-called *conditional* approach. In the *simultaneous* spatial lag model, the spatial pattern for the dependent variable $y$ is jointly determined as a function of the X-variables only. A predicted value that reflects this lesser degree of information is computed from the reduced form, as $y_{pr} = (I - \\hat{\\rho} W)^{-1} X\\hat{\\beta}$. This is **predy_e** in the regression object. The associated forecast error, $y - y_{pr}$ is **e_pred** in the regression object.\n",
+ "\n",
+ "The two types of predicted values and residuals can be readily turned into a data frame by means of `pd.DataFrame` applied to an array constructed with `np.hstack`, in the same way as was done for the OLS predicted values and residuals. In the example, the associated variable names are **ypred**, **yreduce**, **resid** and **forcerr**, passed as the `columns` argument.\n",
+ "\n",
+ "Descriptive statistics are obtained with `describe()`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "343c44c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds = pd.DataFrame(np.hstack((lag1a.predy,lag1a.predy_e,lag1a.u,lag1a.e_pred)),columns=['ypred','yreduce','resid','forcerr'])\n",
+ "preds.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9bd86573-c074-4aec-9d51-13314ca79451",
+ "metadata": {},
+ "source": [
+ "Note some important differences between the two concepts. First, whereas the mean of **ypred** equals the mean of the actual dependent variable of 18.4852 (see **Mean dependent var** in the regression output listing), the mean of the reduced form prediction is slightly different (18.5128). Consequently, the mean of **resid** is essentially zero, but the mean of **forcerr** is slightly negative, at -0.0375. There are other slight differences as well."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "18400a28-a42c-46ad-9657-f17932d67a68",
+ "metadata": {},
+ "source": [
+ "The correlation between the two concepts is high, but not perfect, respectively 0.988 for the predicted values and 0.978 for the residuals."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "44974590-467f-49ed-beec-9725184c233e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Correlation between predicted values: {preds['ypred'].corr(preds['yreduce']):0.3f}\")\n",
+ "print(f\"Correlation between residuals: {preds['resid'].corr(preds['forcerr']):0.3f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "946ad797-705e-4782-9a46-f0865292e917",
+ "metadata": {},
+ "source": [
+ "#### Spatial pattern of residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20a5a87f-e2b2-4149-902c-7a2560b6da56",
+ "metadata": {},
+ "source": [
+ "A final interesting comparison is between the spatial pattern of the two types of residuals. To assess this, a simple Moran scatterplot is constructed, where the spatial lag is computed by means of `libpysal.lag_spatial`. The plot itself is constructed with `sns.regplot`, which superimposes a regression line on the scatter plot of the spatial lag on the original variable. No customization of the graph is carried out.\n",
+ "\n",
+ "For the *naive* residuals, this yields the following plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ecebb8e-d396-4e1c-8092-b6db612aebee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "werr = lag_spatial(wq,preds['resid']).reshape(-1,1)\n",
+ "sns.regplot(x=preds['resid'],y=werr)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd782036-bd82-45d5-90c7-2d527d46fd61",
+ "metadata": {},
+ "source": [
+ "The regression line is essentially flat, which means most/all of the remaining spatial correlation has been eliminated. In contrast, the Moran scatterplot for the prediction error shows a strong positive slope, suggesting remaining spatial clustering."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "245082da-2703-4282-bf5b-24c58f83edac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wfor = lag_spatial(wq,preds['forcerr']).reshape(-1,1)\n",
+ "sns.regplot(x=preds['forcerr'],y=wfor)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "12ae004f-e7cc-47a1-85cc-6426152ff8d3",
+ "metadata": {},
+ "source": [
+ "Why would this be the case? Recall that the predicted value is computed as $y_{pr} = (I - \\hat{\\rho} W)^{-1} X\\hat{\\beta}$. However, the complete expression for the reduced form is $y = (I - \\rho W)^{-1} X\\beta + (I - \\rho W)^{-1}u$. Since the error term is unobserved and its mean is zero, the second term is ignored in the predicted value. As a result, the actual difference between $y$ and $y_{pr}$ is $e = (I - \\rho W)^{-1}u$. This can be written as $e = \\rho We + u$, a spatial autoregressive process, which accounts for the spatial pattern in the residuals.\n",
+ "\n",
+ "Since the residual is *not* filtered for the existing spatial correlation, it will remain spatially correlated itself. This is reflected in the positive slope of the regression line in the Moran scatter plot."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed8bd5bc-4921-40f3-96c3-79dfd68ed92d",
+ "metadata": {},
+ "source": [
+ "#### Mapping predicted values and residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "54225ac9-bdc4-4560-a62f-c2761b8ed963",
+ "metadata": {},
+ "source": [
+ "Optionally, the predicted values and residuals can be added to the spatial data frame in order to construct associated maps. However, since these maps create only visual impressions of spatial patterning, this is not further pursued here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "708ffa42-763f-4ee2-acf0-2ab079795cfd",
+ "metadata": {},
+ "source": [
+ "## ML Estimation of Spatial Durbin Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a27b8d3-13cb-4e6d-aef2-549f19c6dc6f",
+ "metadata": {},
+ "source": [
+ "ML estimation of the Spatial Durbin model is a special case of `spreg.ML_Lag`, with the additional argument of `slx_lags=1` (or a larger value). Everything else remains the same. More specifically, the three methods of `full`, `ord` and `LU` are again available. Only the default `full` is considered here. The results are essentially the same for the other methods. \n",
+ "\n",
+ "To illustrate the difference between the two types of impact measures, `spat_impacts` is set to `[\"simple\",\"full\"]` (note, the default setting remains `spat_impacts = \"simple\"`).\n",
+ "\n",
+ "Another default setting is `spat_diag = True`, which yields the results for the Common Factor Hypothesis test. To avoid this test, `spat_diag` must be set to `False`. In the illustration, both arguments are listed explicitly for clarity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d5bc8e4-3683-4bfd-a904-f944cc69a71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "spdur = ML_Lag(y,x,w=wq,slx_lags=1,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts = ['simple','full'],\n",
+ " spat_diag=True)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(spdur.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73b650f7-2272-4f1a-8d2b-e6c007f45d3a",
+ "metadata": {},
+ "source": [
+    "The inclusion of the spatial lag terms affects the results relative to both the original classic specification and the SLX model. The spatial autoregressive coefficient of 0.40 is highly significant. Similar to what happened in the SLX model relative to the classic specification, **EP_NOHSDP** and **HIS_ct** are no longer significant and neither is **W_EP_NOVEH**, but now **W_EP_LIMENG** is also no longer significant. As in the SLX specification, some of the signs and magnitudes of the WX coefficients run counter to Tobler's Law. According to the latter, the signs of the $\\beta$ coefficients and the matching $\\gamma$ should be the same, which is not the case for **EP_NOHSDP** (although that coefficient is not significant) and **EP_NOVEH** (although the lag is not significant). A consistent pattern of opposite signs may be an indication that the common factor hypothesis holds (see below).\n",
+ "\n",
+ "Relative to the standard Lag model, the inclusion of the WX terms makes the spatial autoregressive coefficient slightly larger (0.40 relative to 0.39), but it renders **HIS_ct** non-significant (this is similar to what happened in the SLX model relative to the classic specification).\n",
+ "\n",
+ "Significance improves slightly relative to the SLX model, with a Log Likelihood of -2410.7 (compared to -2442.8), AIC of 4841.5 (relative to 4903.5), and SC of 4888.2 (relative to 4945.6).\n",
+ "\n",
+ "As in the spatial Lag model, there will be two types of predicted values and residuals, respectively based on a naive approach and the reduced form. This is not considered further since the treatment and interpretation are identical to that in the standard spatial Lag model.\n",
+ "\n",
+ "Further refinements of the model specification can be carried out by eliminating some lag terms by means of `slx_vars`, as in the SLX model. This is not further pursued here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f3b96dfc-3502-4bc9-a3be-5b2b3e211a9e",
+ "metadata": {},
+ "source": [
+ "### Spatial Multipliers - Impacts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e33def78-6d52-4c1d-b917-44bde828ab81",
+ "metadata": {},
+ "source": [
+ "The impact measures for the Spatial Durbin model follow the same logic as in the spatial lag model. They are derived from the reduced form, which is now, ignoring the error term:\n",
+ "\\begin{equation*}\n",
+ "y = (I - \\rho W)^{-1} X \\beta + (I - \\rho W)^{-1} WX \\gamma.\n",
+ "\\end{equation*}\n",
+ "The effect of this is that the various multipliers must be applied to both $\\beta$ and the matching $\\gamma$ to compute the overall impacts.\n",
+ "\n",
+ "As it turns out, the total multiplier is the same as in the lag model, i.e., $1.0 / (1.0 - \\rho)$. However, to get the total effect, this factor needs to be multiplied by both $\\beta$ and the matching $\\gamma$.\n",
+ "\n",
+ "For example, taking $\\rho = 0.4044$ as in the example, the total multiplier follows as $1.673$. The total impact for the variable **EP_LIMENG** (ignoring for now that **W_LIMENG** turned out to be non-significant) would be $1.673 \\times 0.37397 + 1.673 \\times 0.01243 = 0.6464$, the value given in the **Total** column of the impacts listing. As in the lag model, the total impact is the same for the Kim et al approach and the LeSage-Pace approach. The main difference is in the way the direct effect is computed.\n",
+ "\n",
+    "In the Kim et al approach, the direct effect is the coefficient of $\\beta$. This is a strict interpretation of direct effect, i.e., it considers the effect of $\\gamma$ to be indirect. This is consistent with the interpretation of $\\gamma$ in the SLX model, but it is not the approach used by LeSage-Pace in their original treatment. In their formulation, the $ADI$ is applied to both $\\beta$ and $\\gamma$ to compute the direct effect, typically yielding a larger value for the direct effect (as long as $\\beta$ and $\\gamma$ have the same sign). This approach is *not* followed by `spreg`. Instead, the direct effect is obtained by multiplying the $ADI$ with the $\\beta$ coefficient only. As a result, the share attributed to the indirect impact will be larger than in the original LeSage-Pace formulation (in contrast, the latter is followed in the `R` `spatialreg` package).\n",
+ "\n",
+ "The total and direct/indirect impacts can be compared to those suggested by the simple spatial lag model and the SLX model. For example, for **EP_LIMENG**, the total impact of a uniform change in that variable was 0.606 in SLX, 0.512 in the spatial lag model and 0.646 in the spatial Durbin specification. Of this (using the Kim et al logic), 0.220 was spatial spillover (indirect effect) in SLX, 0.201 in spatial lag, and 0.272 in the spatial Durbin model.\n",
+ "\n",
+ "As mentioned above, these impact measures are only summaries. A more meaningful indication of spatial spillovers would follow from a non-uniform change in (some of) the X variables through the use of the full reduced form. In addition, the average may mask some interesting spatial patterns, as demonstrated in an earlier notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38c65c92-c451-4e82-885e-4fe7212d3280",
+ "metadata": {},
+ "source": [
+ "### Common Factor Hypothesis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35b1381e-bf61-48df-924f-1623fb8169c0",
+ "metadata": {},
+ "source": [
+ "A complication in the interpretation of the spatial Durbin model occurs because the spatial autoregressive Error model has an equivalent counterpart\n",
+ "as a spatial Lag formulation that includes WX terms, i.e., the same specification as the spatial Durbin model. Formally:\n",
+ "\\begin{equation*}\n",
+ "y = X \\beta + (I - \\lambda W)^{-1} u = \\lambda Wy + X \\beta - \\lambda WX \\beta + u.\n",
+ "\\end{equation*}\n",
+ "In this alternative specification, the coefficients for the $WX$ variables correspond to the negative product of the autoregressive\n",
+ "and the matching regression parameters (except for the constant term), the so-called *common factor constraint*. \n",
+ "\n",
+ "Following Anselin(1988), the test on the common factor hypothesis $H_0: \\rho \\beta^* + \\gamma = 0$\n",
+ "(with $\\beta^*$ as the vector of regression coefficients without the constant term) consists of three elements:\n",
+ "\n",
+ "- the constraint as a $h \\times 1$ vector $g = \\hat{\\rho} \\hat{\\beta}^* + \\hat{\\gamma}$ (with $h = k-1$)\n",
+ "- a $2h+1 \\times h$ matrix of partial derivatives:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "G = \\begin{bmatrix} \n",
+ "\\hat{\\rho} \\times I_h\\\\\n",
+ "I_h\\\\\n",
+ "\\hat{\\beta'}^*\n",
+ "\\end{bmatrix}\n",
+ "\\end{equation*}\n",
+ "- and the $2h+1 \\times 2h+1$ square asymptotic variance matrix $V$ from the estimated spatial Durbin model (**vm** in the regression object)\n",
+ "\n",
+ "Using the delta method, the common factor statistic then follows as:\n",
+ "\\begin{equation*}\n",
+ "CF = g'[G'VG]^{-1}g \\sim \\chi^2(h).\n",
+ "\\end{equation*}\n",
+ "For our purposes, this suffices, although it should be noted that Juhl(2021) has pointed out potential problems due to the lack of\n",
+ "invariance of the Wald test to different reparameterizations of the null hypothesis."
+ ]
+ },
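+  {
+   "cell_type": "markdown",
+   "id": "1d6e8a93",
+   "metadata": {},
+   "source": [
+    "The statistic can also be pieced together by hand from the spatial Durbin estimates, which makes the construction of $g$ and $G$ explicit. The sketch below assumes that `betas` is ordered as (constant, $\\beta$, $\\gamma$, $\\rho$) and that `vm` is the matching variance matrix with the constant in the first row and column; the slicing should be adapted if the layout differs. In practice the test is reported automatically (see below).\n",
+    "\n",
+    "```python\n",
+    "# hand computation of the common factor statistic (delta method)\n",
+    "import numpy as np\n",
+    "from scipy.stats import chi2\n",
+    "\n",
+    "h = x.shape[1]                              # number of explanatory variables\n",
+    "b = np.asarray(spdur.betas).flatten()\n",
+    "beta = b[1:h + 1]                           # slopes, without the constant\n",
+    "gamma = b[h + 1:2 * h + 1]                  # coefficients of the WX terms\n",
+    "rho = b[-1]                                 # spatial autoregressive coefficient\n",
+    "\n",
+    "g = rho * beta + gamma                      # the constraint vector\n",
+    "G = np.vstack((rho * np.eye(h), np.eye(h), beta.reshape(1, -1)))\n",
+    "V = np.asarray(spdur.vm)[1:, 1:]            # drop the constant (assumed layout)\n",
+    "\n",
+    "cf = float(g @ np.linalg.inv(G.T @ V @ G) @ g)\n",
+    "print('CF statistic:', cf, 'p-value:', chi2.sf(cf, h))\n",
+    "```"
+   ]
+  },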
+ {
+ "cell_type": "markdown",
+ "id": "77471c34-f1a5-4cfe-9ece-f0ec8b5cc618",
+ "metadata": {},
+ "source": [
+ "A test on the common factor hypothesis is included in the spatial Durbin output when `spat_diag = True`, which is the default for this specification.\n",
+ "\n",
+ "In the example above, the value of the test statistic is 32.954, which strongly rejects the null. Some indication of the failure of the common factor hypothesis could also be gleaned from the lack of opposite signs of $\\beta$ and $\\gamma$, which is necessary for the constraint to hold with a positive $\\rho$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9f5f5846-51fa-4ad4-985f-c11a68d2f380",
+ "metadata": {},
+ "source": [
+ "### Likelihood-Ratio Tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab69fd57-726f-4f4d-8bd2-2273235feaa9",
+ "metadata": {},
+ "source": [
+ "A likelihood ratio test is $LR = 2.0 \\times (LogL_1 - LogL_0)$, where $LogL_1$ is the log-likelihood for the *unrestricted* model (i.e., with more non-zero parameters), and $LogL_0$ is the log-likelihood for the *restricted* model (i.e., where some parameters, like $\\rho$, are set to zero). For example, a likelihood ratio test on the coefficient $\\rho$ in the spatial lag model would use the log likelihood in the spatial lag model as $LogL_1$, and the log-likelihood from the classic regression as $LogL_0$. \n",
+ "\n",
+ "The $LR$ statistic is distributed as a Chi-square random variable with degrees of freedom equal to the number of restrictions, i.e., 1 for the spatial autoregressive coefficient, but more for the SLX and spatial Durbin models, depending on how many explanatory variables are included. The LR tests are an alternative to the Wald tests (asymptotic t-values) on the spatial coefficient and the LM tests for spatial effects considered earlier.\n",
+ "\n",
+ "A likelihood ratio test is implemented as `spreg.diagnostics.likratiotest`. Its two arguments are the regression object for the constrained model and the regression object for the unconstrained model. The result is a dictionary with the statistic (`likr`), the degrees of freedom (`df`) and the p-value (`p-value`)"
+ ]
+ },
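+  {
+   "cell_type": "markdown",
+   "id": "0b3a9c64",
+   "metadata": {},
+   "source": [
+    "As a worked illustration with the log-likelihoods reported earlier for the classic regression (-2465.21) and the spatial lag model (-2418.99), the LR statistic for Lag vs OLS follows as\n",
+    "\\begin{equation*}\n",
+    "LR = 2 \\times [-2418.99 - (-2465.21)] = 92.44,\n",
+    "\\end{equation*}\n",
+    "with one degree of freedom (the single restriction $\\rho = 0$). This corresponds to the Lag-OLS result produced by `likratiotest` below."
+   ]
+  },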
+ {
+ "cell_type": "markdown",
+ "id": "fc1e3f8e-4901-4c8e-8cf9-5e41460d40fa",
+ "metadata": {},
+ "source": [
+    "Four different LR tests consider the following constraints:\n",
+ "- Lag vs OLS, i.e., $\\rho = 0$ in the Lag model: arguments are **ols1** and **lag2**\n",
+    "- SDM vs OLS, i.e., both $\\rho = 0$ and $\\gamma = 0$ in the spatial Durbin model: arguments are **ols1** and **spdur**\n",
+ "- SDM vs Lag, i.e., $\\gamma = 0$ in the spatial Durbin model: arguments are **lag2** and **spdur**\n",
+ "- SDM vs SLX, i.e., $\\rho = 0$ in the spatial Durbin model: arguments are **slx1** and **spdur**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c3b9c76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "LR_Lag = likratiotest(ols1,lag2)\n",
+ "LR_SDMO = likratiotest(ols1,spdur)\n",
+ "LR_SDML = likratiotest(lag2,spdur)\n",
+ "LR_SDMS = likratiotest(slx1,spdur)\n",
+ "\n",
+    "print(f\"LR statistic Lag-OLS: {LR_Lag['likr']:0.3f}, d.f. {LR_Lag['df']:2d}, p-value {LR_Lag['p-value']:0.4f}\")\n",
+    "print(f\"LR statistic SDM-OLS: {LR_SDMO['likr']:0.3f}, d.f. {LR_SDMO['df']:2d}, p-value {LR_SDMO['p-value']:0.4f}\")\n",
+    "print(f\"LR statistic SDM-Lag: {LR_SDML['likr']:0.3f}, d.f. {LR_SDML['df']:2d}, p-value {LR_SDML['p-value']:0.4f}\")\n",
+    "print(f\"LR statistic SDM-SLX: {LR_SDMS['likr']:0.3f}, d.f. {LR_SDMS['df']:2d}, p-value {LR_SDMS['p-value']:0.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca6ed963-0f88-4354-8236-a94beba50e4a",
+ "metadata": {},
+ "source": [
+ "In the current example, all null hypotheses are strongly rejected. Based on this evidence alone, it would suggest that the most appropriate specification is the spatial Durbin model. However, conflicts with the signs and magnitudes of the coefficients make that model difficult to interpret. Most importantly, several parameters violate Tobler's law, one of the most important tenets of spatial analysis. While in and of itself this is not sufficient to dismiss the model specification, it does require careful consideration of the interpretation.\n",
+ "\n",
+ "The Likelihood Ratio, Wald (the square of the asymptotic t-ratio) and Lagrange Multiplier test statistics are considered to be *classic* tests. Asymptotically, they are equivalent, but in finite samples, they tend to follow the order LM < LR < W.\n",
+ "\n",
+ "For the lag model in this example, the LM-Lag test statistic was 109.463, the Wald test was 9.922^2 or 98.446, and the LR test (above) 92.446. Whereas the LR and Wald test follow the prescribed order, the LM-Lag test does not, which may point to potential remaining specification problems.\n",
+ "\n",
+ "As mentioned, the model can be refined by selectively setting `slx_vars`, but this is not pursued here."
+ ]
+ },
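+ {
+ "cell_type": "markdown",
+ "id": "classic-test-order-md",
+ "metadata": {},
+ "source": [
+ "As a quick check of the ordering just mentioned, the small cell below compares the three statistics for the lag model. The LM-Lag and Wald values are typed in from the discussion above; only the LR statistic is taken from the `likratiotest` result."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "classic-test-order-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lm_lag = 109.463         # LM-Lag statistic reported in the OLS diagnostics\n",
+ "wald_lag = 9.922 ** 2    # square of the asymptotic z-value of the lag coefficient\n",
+ "lr_lag = LR_Lag['likr']  # LR statistic from the cell above\n",
+ "print(f\"Wald: {wald_lag:0.3f}, LR: {lr_lag:0.3f}, LM: {lm_lag:0.3f}\")\n",
+ "print(\"expected finite sample order LM < LR < W holds:\", lm_lag < lr_lag < wald_lag)"
+ ]
+ },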
+ {
+ "cell_type": "markdown",
+ "id": "3b086a22",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "While the ML estimation paradigm is very powerful, it also is not robust to various forms of misspecification. This is difficult to consider when the results are viewed in isolation, but it is important to keep in mind. As practice, different model specifications could be considered, including adding additional explanatory variables, selectively removing some lag terms, and using different spatial weights. Make sure to carefully consider the interpretation of the estimated coefficients and associated direct and indirect effects."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/14_IV_estimation_spatial_lag.ipynb.txt b/_sources/notebooks/14_IV_estimation_spatial_lag.ipynb.txt
new file mode 100644
index 00000000..1c0e2127
--- /dev/null
+++ b/_sources/notebooks/14_IV_estimation_spatial_lag.ipynb.txt
@@ -0,0 +1,670 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "89145993",
+ "metadata": {},
+ "source": [
+ "# Instrumental Variables Estimation - Spatial Lag Model\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/19/2024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a85427f9",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "An alternative to maximum likelihood is to tackle the endogeneity of the spatially lagged dependent variable by means of instrumental variables (IV) estimation. This is implemented in the `spreg.GM_Lag` class. As before, the estimation of the Spatial Durbin model is achieved through the inclusion of the `slx_lags` argument. \n",
+ "\n",
+ "Distinct from what is possible for maximum likelihood, other endogenous variables can be included as well, using the familiar **yend** (for the endogenous variables) and **q** (for the instruments) arguments.\n",
+ "\n",
+ "The treatment in this notebook will focus on the specific properties of the IV estimation. Generic properties of the spatial lag model, such as the different predicted values, residuals, and the impact measures will not be treated in detail again. Technical aspects pertaining to these issues are covered in the maximum likelihood notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9a2c9eb",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "As before, the main module is *spreg* for spatial regression analysis. From this, `OLS` and `GM_Lag` are imported. In addition, the utilities in *libpysal* (to open spatial weights and access the sample data set), *pandas* and *geopandas* are needed. All of these rely on *numpy* as a dependency. \n",
+ "\n",
+ "The usual *numpy* `set_printoptions` is included as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ac490b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "from libpysal import weights\n",
+ "\n",
+ "from spreg import OLS, GM_Lag\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ee18820",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " \n",
+ "- from libpysal:\n",
+ " - io.open\n",
+ " - examples.get_path\n",
+ " - weights.distance.Kernel\n",
+ " \n",
+ "- from spreg:\n",
+ " - spreg.OLS\n",
+ " - spreg.GM_Lag"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47934d1b-7587-4ddf-be38-83904eede8e8",
+ "metadata": {},
+ "source": [
+ "### Variable definition and data input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfb04af4-4521-4ac0-8e55-d3b92f54a403",
+ "metadata": {},
+ "source": [
+ "As in the maximum likelihood notebook, the data set and spatial weights are from the **chicagoSDOH** sample data set:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity weights\n",
+ "- **Chi-SDOH_k10tri.kwt**: triangular kernel weights based on a variable bandwidth with 10 nearest neighbors from `GeoDa`\n",
+ "\n",
+ "The same descriptive model is used that relates the rate of uninsured households in a tract(for health insurance, **EP_UNINSUR**) to the lack of high school education (**EP_NOHSDP**), the economic deprivation index (**HIS_ct**), limited command of English (**EP_LIMENG**) and the lack of access to a vehicle (**EP_NOVEH**). This is purely illustrative of a spatial lag specification and does not have a particular theoretical or policy motivation.\n",
+ "\n",
+ "In an alternative specification, **HIS_ct** is considered to be endogenous, with, as before, **COORD_X** and **COORD_Y** as instruments.\n",
+ "\n",
+ "The file names and variable names are set in the usual manner. Any customization for different data sets/weights and different variables should be specified in this top cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "14a1e98e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infileq = get_path(\"Chi-SDOH_q.gal\") # queen contiguity weights created with GeoDa\n",
+ "infilek = get_path(\"Chi-SDOH_k10tri.kwt\") # triangular kernel weights\n",
+ "\n",
+ "y_name = 'EP_UNINSUR'\n",
+ "x_names = ['EP_NOHSDP','HIS_ct','EP_LIMENG','EP_NOVEH']\n",
+ "xe_names = ['EP_NOHSDP','EP_LIMENG','EP_NOVEH']\n",
+ "yend_names = ['HIS_ct']\n",
+ "q_names = ['COORD_X','COORD_Y']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "w_name = 'Chi-SDOH_q'\n",
+ "wk_name = 'Chi-SDOH_k10tri'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9514984-a22e-4d5b-a993-f40771a290a0",
+ "metadata": {},
+ "source": [
+ "The `read_file` and `open` functions are used to access the sample data set and contiguity weights. The contiguity weights are row-standardized, the `class` of the kernel weights adjusted, and the data frames for the dependent and explanatory variables are constructed. As before, this functionality is agnostic to the actual data sets and variables used, since it relies on the specification given in the initial block above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3248d23d-2247-40bf-a3a5-3ba21b477aa8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = open(infileq).read() \n",
+ "wq.transform = 'r' # row-transform the weights\n",
+ "wk = open(infilek).read()\n",
+ "wk.__class__ = weights.distance.Kernel\n",
+ "y = dfs[y_name]\n",
+ "x = dfs[x_names]\n",
+ "yend = dfs[yend_names]\n",
+ "xe = dfs[xe_names]\n",
+ "q = dfs[q_names]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2aea09bf",
+ "metadata": {},
+ "source": [
+ "## OLS and SLX with Spatial Diagnostics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb5eb774-814b-467e-ad51-11d21d3cee50",
+ "metadata": {},
+ "source": [
+ "For easy reference, standard OLS and SLX regressions with spatial diagnostics are repeated here to provide a point of reference. The results are identical to those reported in the maximum likelihood notebook\n",
+ "\n",
+ "Refer to the specific OLS notebook for further details."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fdb2b8ce-6ae8-4a15-a6c7-7ec23700dfc7",
+ "metadata": {},
+ "source": [
+ "### OLS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "379724ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x,w=wq,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d0e0f74-5877-4497-8e7b-79394b6701fd",
+ "metadata": {},
+ "source": [
+ "The specification achieves an acceptable $R^2$ of about 0.63 and all coefficients are positive and highly significant.\n",
+ "\n",
+ "The non-spatial diagnostics suggest non-normality as well as a hight degree of heteroskedasticity. There is no problem with multicollinearity.\n",
+ "\n",
+ "The spatial diagnostics against the SARERR alternatives show very significant LM-Lag and LM-Error, but of the two robust tests, only RLM-Lag is highly significant (RLM-Error only at p < 0.03). Hence, there is a strong indication that a Lag rather than an Error alternative may be appropriate. While the joint LM test is also highly significant, this is likely due to a strong one-sided (Lag) alternative.\n",
+ "\n",
+ "Interestingly, the diagnostics against a spatial Durbin alternative strongly support the latter as well. Both LM tests and their robust forms are highly significant, and so is the joint test. Moreover, the value for the robust forms of the test is smaller than the original, which is the expected behavior (although not always reflected in empirical practice).\n",
+ "\n",
+ "In sum, in addition to a spatial Lag model as an alternative, the spatial Durbin specification deserves consideration as well."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0cfc5d4e-a1b3-4b07-8120-2def922d8ef6",
+ "metadata": {},
+ "source": [
+ "### SLX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "53f9b90d-7f6d-4b3e-9e99-83430957fe3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slx1 = OLS(y,x,w=wq,slx_lags=1,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(slx1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ee2be15-2067-4caf-b02b-aca6b46118c5",
+ "metadata": {},
+ "source": [
+ "Relative to the classic regression model, the fit improves slightly, but the constant, **EP_NOHSDP** and **HIS_CT** become non-significant at p = 0.01 (they are marginally signifcant at p=0.05). All but one coefficient of the SLX terms are significant (**W_EP_NOVEH** is not). The signs and magnitudes of the SLX coefficients relative to their unlagged counterparts remain a bit confusing. Only for **EP_LIMENG** and **W_EP_LIMENG** are they the same, with the lag coefficient smaller than the unlagged one, in accordance with Tobler's law. The coefficient for **W_HIS_ct** is significant and larger than that of **HIS_ct**, while the latter is not significant at p = 0.01. In other words, the interpretation of these results in terms of distance decay and Tobler's law may be a bit problematic.\n",
+ "\n",
+ "In terms of diagnostics, there is a slight problem with multicollinearity (often the case in SLX specifications), strong non-normality and evidence of heteroskedasticity. Moran's I is significant, as are both LM-tests, but neither of the robust forms is significant. Based on the relative magnitudes of the test statistics, there is a slight indication of a possible Lag alternative, i.e., a spatial Durbin specification. However, this indication is not as strong as that provided by the LM-SDM test statistics in the classic specification."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8a0c186-dd30-4f64-9a5b-d95bb562ebde",
+ "metadata": {},
+ "source": [
+ "## IV Estimation of the Lag Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cc74112-a2eb-4efa-9116-d1e5197d20ed",
+ "metadata": {},
+ "source": [
+ "### Principle"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0385d805-bfb4-4e79-b0ed-0fc59c811926",
+ "metadata": {},
+ "source": [
+ "The lag model $y = \\rho Wy + X\\beta + u$ can also be written as $y = Z\\delta + u$, where $Z = [ X, Wy ]$ and the coefficient vector is rearranged as $\\delta = [\\beta, \\rho ]$. This setup is the same as that considered in the treatment of endogeneity by means of 2SLS estimation.\n",
+ "\n",
+ "The spatially lagged dependent variable is indeed endogenous. Based on the reduced form, \n",
+ "$$y = (I - \\rho W)^{-1} X\\beta + (I - \\rho W)^{-1}u,$$ \n",
+ "$Wy$ follows as:\n",
+ "$$Wy = W(I - \\rho W)^{-1} X\\beta + W(I - \\rho W)^{-1}u.$$\n",
+ "As a result, after some algebra, $E[(Wy)'u] = tr[W(I - \\rho W)'^{-1}].E[uu'] \\neq 0$. This follows because of the presence of non-zero diagonal terms in the inverse matrix (see the discussion of impacts in the maximum likelihood notebook). Consequently, one of the fundamental assumptions of OLS estimation is violated, and Wy must be treated as an endogenous variable.\n",
+ "\n",
+ "This is carried out by means of 2SLS estimation, whereby the main question becomes the choice of good (optimal) instruments for Wy. The conditional expectation of Wy, given X, provides the basis for this:\n",
+ "$$E[Wy | X] = W(I - \\rho W)^{-1} X\\beta = W(I + \\rho W + \\rho^2 W + \\dots) X \\beta.$$\n",
+ "As a result, the spatially lagged explanatory variables $WX, W^2X, \\dots$ suggest themselves as instruments. Since the $X$ are assumed to be uncorrelated with the error terms, so are the $WX$. Also, from the conditional expectation, it follows that they are related to the endogenous Wy, which satisfies a second requirement for instruments. Note that WX is *not* applied to the constant term, since this would result in perfect multicollinearity.\n",
+ "\n",
+ "Estimation then follows as a straightforward application of 2SLS, referred to as *spatial* 2SLS, or S2SLS. The estimator, variance-covariance matrix, and robust standard errors are the same as for the classis 2SLS (see the 2SLS notebook for details).\n",
+ "\n",
+ "In a straightforward manner, additional endogenous variables with associated instruments can be incorporated as well. One question remains whether the additional instruments should be lagged or not. In `spreg.GM_Lag`, this is handled by the `lag_q` option (the default is `lag_q = True`)."
+ ]
+ },
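+ {
+ "cell_type": "markdown",
+ "id": "s2sls-byhand-md",
+ "metadata": {},
+ "source": [
+ "To make the construction of the instruments concrete, the cell below sketches spatial 2SLS by hand for the current specification, using $WX$ as instruments for $Wy$. This is purely illustrative: `spreg.GM_Lag`, used next, carries out these steps (and the associated inference) internally."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "s2sls-byhand-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative spatial 2SLS by hand -- GM_Lag carries out these steps internally\n",
+ "yv = y.to_numpy().reshape(-1, 1)\n",
+ "Xv = x.to_numpy()\n",
+ "n = yv.shape[0]\n",
+ "\n",
+ "wy = weights.lag_spatial(wq, yv)          # spatially lagged dependent variable Wy\n",
+ "WX = weights.lag_spatial(wq, Xv)          # instruments: WX (no constant)\n",
+ "\n",
+ "Z = np.hstack((np.ones((n, 1)), Xv, wy))  # [1, X, Wy]\n",
+ "H = np.hstack((np.ones((n, 1)), Xv, WX))  # instrument matrix [1, X, WX]\n",
+ "\n",
+ "Z_hat = H @ np.linalg.solve(H.T @ H, H.T @ Z)           # first stage: project Z on H\n",
+ "delta = np.linalg.solve(Z_hat.T @ Z_hat, Z_hat.T @ yv)  # second stage\n",
+ "print(\"beta (constant first) and rho (last):\", delta.flatten())"
+ ]
+ },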
+ {
+ "cell_type": "markdown",
+ "id": "1bfd7648-bdd1-4ebb-bcf5-4dc74782693d",
+ "metadata": {},
+ "source": [
+ "### Implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ada2a0d-81fc-4b07-8ba9-d7e1bc101b1a",
+ "metadata": {},
+ "source": [
+ "IV estimation of the spatial lag model is carried out by means of `spreg.GM_Lag`. This is a customized implementation of the two stage least squares estimation, with the instruments for the spatially lagged dependent variable computed internally. As mentioned, additional endogenous variables can be specified as well.\n",
+ "\n",
+ "The default setup requires the dependent variable, `y`, the explanatory variables (without a constant term), `x`, and the spatial weights `w`. The instruments are the spatially lagged explanatory variables, WX. They do not need to be specified separately. The order of spatial weights used as instruments is set by means of `w_lags` (the default is `w_lags = 1`).\n",
+ "\n",
+ "As is customary, the main results are listed using the `summary` method. For now, no impacts are listed by setting `spat_impacts = None` (note that `spat_impacts = \"simple\"` is the default). The AK-test for remaining residual spatial autocorrelation is included by default (i.e., `spat_diag = True`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8887e6b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag1 = GM_Lag(y,x,w=wq,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts = None)\n",
+ "print(lag1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8ee6e4e6",
+ "metadata": {},
+ "source": [
+ "In this example, the estimates are very similar to the ML results (see the ML notebook). This is not always the case, but it is encouraging when it is (in the sense of not having other misspecification issues). The autoregressive coefficient of 0.377 compares to 0.392 in the ML case and is highly significant. As in the ML case, the coefficients of the constant term and of **EP_NOHSDP** become non-significant. The other regression coefficients are of similar magnitudes as in the ML case and thus also generally smaller than the corresponding OLS estimates.\n",
+ "\n",
+ "The results show the **Instrumented** variable as **W_EP_UNINSUR** and the **Instruments** as the spatial lags of the explanatory variables.\n",
+ "\n",
+ "As in the ML case, there are two measures of fit, the **Pseudo R-squared** (based on the *naive* residuals) and the **Spatial Pseudo R-squared** (based on the predicted values from the reduced form). The results are essentially the same as for the ML estimates, e.g., respectively 0.684 (vs. 0.685) and 0.647 (vs. 0.647). In constrast to the ML results, there is no Likelihood, AIC or SC.\n",
+ "\n",
+ "Note that with S2SLS estimation, it is possible for the spatial autoregressive parameter to take on a value larger than one. Unlike maximum likelihood estimation, where the parameter space is constrained in the optimization routine, there is no such constraint in S2SLS. When the parameter estimate is outside the bounds, a warning is given and any properties based on the reduced form predicted values will not be computed. This includes **predy_e** and **e_pred**, as well as the **Spatial Pseudo R-squared** and the spatial impacts (for an example, see below).\n",
+ "\n",
+ "The AK test shows no evidence of remaining residual spatial autocorrelation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a14ac009",
+ "metadata": {},
+ "source": [
+ "### Instrument lag order"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1aea751",
+ "metadata": {},
+ "source": [
+ "The order of the spatial lags used for the instruments is determined by the `w_lags` argument. By default, this is set to 1. The use of higher order lags results in greater precision, but often at the cost of an increase in multicollinearity. In practice, using lags larger than 2 is not recommended.\n",
+ "\n",
+ "The effect of setting `w_lags = 2` is illustrated next."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc994370",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag2 = GM_Lag(y,x,w=wq,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " w_lags = 2,\n",
+ " spat_impacts = None)\n",
+ "print(lag2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fdb8126b",
+ "metadata": {},
+ "source": [
+ "The additional instruments are listed below the parameter estimates with the prefix **W2**. In this example, the effect of including the second order lags is minimal. The autoregressive coefficient becomes 0.381 (vs. 0.377) and the standard errors are marginally smaller. The fit is unchanged."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f747b04",
+ "metadata": {},
+ "source": [
+ "### Predicted values and residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1306bd65",
+ "metadata": {},
+ "source": [
+ "The treatment of predicted values and residuals is the same as for maximum likelihood estimation. The *naive* results are stored in the regression object attributes **predy** and **u**, and the values that are based on the reduced form prediction are in **predy_e** and **e_pred**."
+ ]
+ },
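+ {
+ "cell_type": "markdown",
+ "id": "lag-preds-sketch-md",
+ "metadata": {},
+ "source": [
+ "As a minimal illustration, the different predicted values and residuals of the `lag2` object can be collected into a data frame for a quick summary. This assumes the attributes listed above are available, i.e., that the spatial autoregressive estimate is within the parameter space."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "lag-preds-sketch-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds = pd.DataFrame(np.hstack((lag2.predy, lag2.u, lag2.predy_e, lag2.e_pred)),\n",
+ "                     columns=['predy', 'u', 'predy_e', 'e_pred'])\n",
+ "preds.describe()"
+ ]
+ },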
+ {
+ "cell_type": "markdown",
+ "id": "b8a6d92f",
+ "metadata": {},
+ "source": [
+ "## Robust Standard Errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a1bb5e8",
+ "metadata": {},
+ "source": [
+ "As in the standard 2SLS estimation, it is possible to obtain robust standard errors by means of the `robust = \"white\"` and `robust = \"hac\"` options. Given the common presence of heteroskedastic errors in cross-sectional regression, this is highly recommended. The results are given below. As before, the `hac` option requires that a kernel spatial weights object is specified as the argument `gwk`, with, optionally, `name_gwk`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e17afc1",
+ "metadata": {},
+ "source": [
+ "### White standard errors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "791f7485",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag3 = GM_Lag(y,x,w=wq,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " robust = 'white',\n",
+ " spat_impacts = None)\n",
+ "print(lag3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "61ac8385",
+ "metadata": {},
+ "source": [
+ "The estimates are the same as before, but the standard errors are slightly larger. For example, for the spatial autoregressive coefficient, the standard error becomes 0.07377, compared to 0.06455. The overall impact is minimal."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a174f964",
+ "metadata": {},
+ "source": [
+ "### HAC standard errors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b422888e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag4 = GM_Lag(y,x,w=wq,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " robust = 'hac',gwk=wk,name_gwk=wk_name,\n",
+ " spat_impacts = None)\n",
+ "print(lag4.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7697c10",
+ "metadata": {},
+ "source": [
+ "In this case, the HAC standard errors are very similar to the heteroskedasticity-robust ones, suggesting the main source of misspecification comes from the latter. This is in accordance with the lack of significance of the AK-test. The standard error for the spatial autoregressive coefficient, 0.07238, is even slightly smaller than the White standard error (0.07337)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a9f084bf-276f-44a7-863b-41835831d7ee",
+ "metadata": {},
+ "source": [
+ "## Spatial Multipliers - Impacts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b6ed0760-3ba2-4f0e-a06d-a295c13e29b4",
+ "metadata": {},
+ "source": [
+ "Similar to what holds for ML estimation, impact measures are computed for spatial lag models estimated by means of instrumental variables. This is implemented through the `spat_impacts` argument. The default setting is `spat_impacts = \"simple\"` for the Kim et al approach (see the ML notebook for technical details). As before, the other options are `full`, `power`, `all` or ``None`.\n",
+ "\n",
+ "In the example below, the argument is set as `spat_impacts = [\"simple\",\"full\"]`. Only classic standard errors are considered. Since the multipliers and impacts only depend on the coefficient estimates, the type of standard error is immaterial.\n",
+ "\n",
+ "Note that the reported impacts are only *average* effects. See the spatial multipliers notebook for a more extensive analysis of the associated spatial pattern."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4cb376e5-d8b2-4aaf-aad9-e3c659613419",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag5 = GM_Lag(y,x,w=wq,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=['simple','full'])\n",
+ "print(lag5.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4b392f8",
+ "metadata": {},
+ "source": [
+ "The interpretation of the direct, indirect and total effects is the same as before. Since the coefficient estimates were very similar to the results obtained for ML, the impacts are similar as well."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9f83c898",
+ "metadata": {},
+ "source": [
+ "## Additional Endogenous Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5ec68eb",
+ "metadata": {},
+ "source": [
+ "Additional endogenous variables are included by means of the **yend** and **q** arguments, in the same way as for classic 2SLS estimation. The only relevant option in this regard is whether the instruments should be lagged as well. The default is to include their spatial lags, through `lag_q = True`. In practice, there is no good reason not to lag them, since they provide additional information. For clarity, the `lag_q` argument is included, even though it is not needed, since it is the default.\n",
+ "\n",
+ "Note that here `x = xe`, which contains only the exogenous variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4b87f919",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag6 = GM_Lag(y,x=xe,w=wq,yend=yend,q=q,\n",
+ " lag_q = True,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=['simple','full'])\n",
+ "print(lag6.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c0d983e4",
+ "metadata": {},
+ "source": [
+ "The output listing now includes **HIS_ct** as **Instrumented**, and both **COORD_X** and **COORD_Y**, as well as their spatial lags among the **Instruments**.\n",
+ "\n",
+ "The impact on the estimates is relatively minor, although the spatial autoregressive coefficient decreases slightly to 0.32995 (compared to 0.37698). The treatment of **HIS_ct** as endogenous results in its coefficient becoming only marginally significant (p=0.02). The model impacts reflect the slightly different coefficient estimates. \n",
+ "\n",
+ "Note that impacts are only computed for the exogenous variables. In a strict sense, the additional endogenous variables should not be part of the reduced form. Moreover, since they are not determined in a fully simultaneous equation system, there is no practical way to include other exogenous variables from such a system. Therefore, the multiplier effect of additional endogenous variables is not considered."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "708ffa42-763f-4ee2-acf0-2ab079795cfd",
+ "metadata": {},
+ "source": [
+ "## IV Estimation of Spatial Durbin Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a27b8d3-13cb-4e6d-aef2-549f19c6dc6f",
+ "metadata": {},
+ "source": [
+ "IV estimation of the Spatial Durbin model is a special case of `spreg.GM_Lag`, with the additional argument of `slx_lags=1` (or a larger value). Everything else remains the same. \n",
+ "\n",
+ "In the example, `spat_impacts` is set to `[\"simple\",\"full\"]` (note, the default setting remains `spat_impacts = \"simple\"`).\n",
+ "\n",
+ "Another default setting is `spat_diag = True`, which yields the results for the both the AK test and the Common Factor Hypothesis test. To avoid these tests, `spat_diag` must be set to `False`. In the illustration, both arguments are listed explicitly for clarity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d5bc8e4-3683-4bfd-a904-f944cc69a71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spdur1 = GM_Lag(y,x,w=wq,slx_lags=1,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts = ['simple','full'],\n",
+ " spat_diag = True)\n",
+ "print(spdur1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73b650f7-2272-4f1a-8d2b-e6c007f45d3a",
+ "metadata": {},
+ "source": [
+ "Note how the list of **Instruments** now consists of the second order spatial lags, even though `w_lags = 1`. This is because the spatial Durbin model already includes the first order lags as explanatory variables. Using first order lags as instruments would result in perfect multicollinearity. This is detected internally in `GM_Lag`, so that the computation of the lagged instruments is adjusted accordingly.\n",
+ "\n",
+ "As in the case of ML estimation, the inclusion of the lagged explanatory variables has quite an effect, even though only **W_EP_NOHSDP** turns out to be significant (for ML, **W_HIS_ct** was significant as well). Also, the spatial autoregressive coefficient (0.509) is no longer significant. Three of the four lag coefficients (including the non-significant ones) have a negative sign, the opposite of the original regression coefficients. This raises the suspicion that the proper specification may be a spatial error model. The results of the Common Factory Hypothesis Test bear this out. At 4.083, it is *not* significant, which means that the common factor constraint can *not* be rejected. This contrasts greatly with the result for ML estimation, where the constraint was strongly rejected. However, given the lack of significance of the spatial lag terms, this needs to be interpreted with caution. A better strategy would be to consider further refinements of the model specification by eliminating some lag terms by means of `slx_vars`, as in the SLX model. This is not further pursued here.\n",
+ "\n",
+ "The impacts are computed as before. Also, robust standard errors can be implemented in the usual fashion."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b180e95a",
+ "metadata": {},
+ "source": [
+ "### Spatial Durbin model with endogenous variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4a66754f",
+ "metadata": {},
+ "source": [
+ "Additional endogenous variables can be included in a spatial Durbin specification in the same way as in the standard spatial lag model. With `spat_diag = True`, the default, only the AK test is produced. Since there is no spatial lag for the endogenous variable, a common factor test is not meaningful.\n",
+ "\n",
+ "The example below uses the same specification as in the spatial lag model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "46e70ee3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spdur2 = GM_Lag(y,x=xe,w=wq,yend=yend,q=q,\n",
+ " lag_q = True,\n",
+ " slx_lags=1,\n",
+ " name_w=w_name,name_ds=ds_name,\n",
+ " spat_impacts=['simple','full'],\n",
+ " spat_diag = True)\n",
+ "print(spdur2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d0d987f",
+ "metadata": {},
+ "source": [
+ "In this example, the estimate for the spatial autoregressive coefficient turns out to be larger than one (1.01654), which is outside the accepted parameter space. This raises a warning and precludes the computation of the spatial pseudo R-squared and the model impacts.\n",
+ "\n",
+ "When this happens, further refinement of the model and/or estimation is required. There are several options, such as using higher lags for the instruments, using different/more instruments for the additional endogenous variable, dropping some lag terms (using `slx_vars`), or changing the original specification altogether. This is left as an exercise."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b086a22",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "IV estimation of the spatial lag model is sensitive to several aspects of the model specification. However, in large(r) samples, it generally yields more robust results than maximum likelihood, especially when using robust standard errors.\n",
+ "\n",
+ "As practice, different model specifications could be considered, including adding additional explanatory variables, selectively removing some lag terms, and using different spatial weights. Make sure to carefully consider the interpretation of the estimated coefficients and associated direct and indirect effects, as well as the robustness of the standard errors."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/15_ML_estimation_spatial_error.ipynb.txt b/_sources/notebooks/15_ML_estimation_spatial_error.ipynb.txt
new file mode 100644
index 00000000..80239f32
--- /dev/null
+++ b/_sources/notebooks/15_ML_estimation_spatial_error.ipynb.txt
@@ -0,0 +1,723 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "89145993",
+ "metadata": {},
+ "source": [
+ "# Maximum Likelihood Estimation - Spatial Error Model\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/21/2024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a85427f9",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "Similar to the treatment of the spatial lag model, the estimation of the spatial error model is covered in two notebooks. This first one covers Maximum Likelihood estimation. General Method of Moments (GMM) estimation is considered in a separate notebook.\n",
+ "\n",
+ "As mentioned in the spatial lag notebook, it should be kept in mind that the maximum likelihood estimation in `spreg` is primarily included for pedagogical purposes. Generally, the GMM approach is preferred. In addition, an optimal maximum likelihood estimation implementation, based on the Smirnov-Anselin (2001) approximation, is not currently implemented in `spreg`. It is implemented in C++ in `GeoDa`. This is the preferred approach for ML estimation in large(r) data sets.\n",
+ "\n",
+ "The `spreg` module implements ML estimation of the spatial error model in the `ML_Error` class. The estimation of the SLX-Error model is implemented through the inclusion of the `slx_lags` argument. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9a2c9eb",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "As before, the main module is *spreg* for spatial regression analysis. From this, `OLS` and `ML_Error` are imported. In addition, the utilities in *libpysal* (to open spatial weights and access the sample data set), *pandas* and *geopandas* are needed, as well as *time* (for some timing results), *matplotlib.pyplot* and *seaborn* for visualization. All of these rely on *numpy* as a dependency. Finally, in order to carry out the Likelihood Ratio tests, `likratiotest` is imported from `spreg.diagnostics`.\n",
+ "\n",
+ "The usual *numpy* `set_printoptions` is included as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ac490b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "import time\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "from libpysal.weights import lag_spatial\n",
+ "\n",
+ "from spreg import OLS, ML_Error\n",
+ "from spreg.diagnostics import likratiotest\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ee18820",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " - DataFrame\n",
+ " - head\n",
+ " - describe\n",
+ " \n",
+ "- from libpysal:\n",
+ " - io.open\n",
+ " - examples.get_path\n",
+ " - weights.lag_spatial\n",
+ " \n",
+ "- from numpy:\n",
+ " - hstack\n",
+ "\n",
+ "- from matplotlib/seaborn:\n",
+ " - regplot\n",
+ " - show\n",
+ "\n",
+ "- from spreg:\n",
+ " - spreg.OLS\n",
+ " - spreg.ML_Error\n",
+ " - spreg.diagnostics.likratiotest"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47934d1b-7587-4ddf-be38-83904eede8e8",
+ "metadata": {},
+ "source": [
+ "### Variable definition and data input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfb04af4-4521-4ac0-8e55-d3b92f54a403",
+ "metadata": {},
+ "source": [
+ "The data set and spatial weights are from the **chicagoSDOH** sample data set:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity weights\n",
+ "\n",
+ "To illustrate the methods, the same descriptive model is used that relates the rate of uninsured households in a tract(for health insurance, **EP_UNINSUR**) to the lack of high school education (**EP_NOHSDP**), the economic deprivation index (**HIS_ct**), limited command of English (**EP_LIMENG**) and the lack of access to a vehicle (**EP_NOVEH**). This is purely illustrative of a spatial error specification and does not have a particular theoretical or policy motivation.\n",
+ "\n",
+ "The file names and variable names are set in the usual manner. Any customization for different data sets/weights and different variables should be specified in this top cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "14a1e98e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infileq = get_path(\"Chi-SDOH_q.gal\") # queen contiguity weights created with GeoDa\n",
+ "\n",
+ "y_name = 'EP_UNINSUR'\n",
+ "x_names = ['EP_NOHSDP','HIS_ct','EP_LIMENG','EP_NOVEH']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "w_name = 'Chi-SDOH_q'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9514984-a22e-4d5b-a993-f40771a290a0",
+ "metadata": {},
+ "source": [
+ "The `read_file` and `open` functions are used to access the sample data set and contiguity weights. The weights are row-standardized and the data frames for the dependent and explanatory variables are constructed. As before, this functionality is agnostic to the actual data sets and variables used, since it relies on the specification given in the initial block above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3248d23d-2247-40bf-a3a5-3ba21b477aa8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = open(infileq).read() \n",
+ "wq.transform = 'r' # row-transform the weights\n",
+ "y = dfs[y_name]\n",
+ "x = dfs[x_names]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2aea09bf",
+ "metadata": {},
+ "source": [
+ "## OLS and SLX with Spatial Diagnostics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb5eb774-814b-467e-ad51-11d21d3cee50",
+ "metadata": {},
+ "source": [
+ "For ease of reference, standard OLS and SLX regressions with spatial diagnostics included in this notebook as well. These results are identical to the ones provided in the spatial lag notebooks."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fdb2b8ce-6ae8-4a15-a6c7-7ec23700dfc7",
+ "metadata": {},
+ "source": [
+ "### OLS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "379724ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x,w=wq,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d0e0f74-5877-4497-8e7b-79394b6701fd",
+ "metadata": {},
+ "source": [
+ "The specification achieves an acceptable $R^2$ of about 0.63 and all coefficients are positive and highly significant.\n",
+ "\n",
+ "The non-spatial diagnostics suggest non-normality as well as a hight degree of heteroskedasticity. There is no problem with multicollinearity.\n",
+ "\n",
+ "The spatial diagnostics against the SARERR alternatives show very significant LM-Lag and LM-Error, but of the two robust tests, only RLM-Lag is highly significant (RLM-Error only at p < 0.03). Hence, there is a strong indication that a Lag rather than an Error alternative may be appropriate. While the joint LM test is also highly significant, this is likely due to a strong one-sided (Lag) alternative.\n",
+ "\n",
+ "Interestingly, the diagnostics against a spatial Durbin alternative strongly support the latter as well. Both LM tests and their robust forms are highly significant, and so is the joint test. Moreover, the value for the robust forms of the test is smaller than the original, which is the expected behavior (although not always reflected in empirical practice).\n",
+ "\n",
+ "In sum, in addition to a spatial Lag model as an alternative, the spatial Durbin specification deserves consideration as well."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0cfc5d4e-a1b3-4b07-8120-2def922d8ef6",
+ "metadata": {},
+ "source": [
+ "### SLX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "53f9b90d-7f6d-4b3e-9e99-83430957fe3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slx1 = OLS(y,x,w=wq,slx_lags=1,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(slx1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ee2be15-2067-4caf-b02b-aca6b46118c5",
+ "metadata": {},
+ "source": [
+ "Relative to the classic regression model, the fit improves slightly, but the constant, **EP_NOHSDP** and **HIS_CT** become non-significant at p = 0.01 (they are marginally signifcant at p=0.05). All but one coefficient of the SLX terms are significant (**W_EP_NOVEH** is not). The signs and magnitudes of the SLX coefficients relative to their unlagged counterparts remain a bit confusing. Only for **EP_LIMENG** and **W_EP_LIMENG** are they the same, with the lag coefficient smaller than the unlagged one, in accordance with Tobler's law. The coefficient for **W_HIS_ct** is significant and larger than that of **HIS_ct**, while the latter is not significant at p = 0.01. In other words, the interpretation of these results in terms of distance decay and Tobler's law may be a bit problematic.\n",
+ "\n",
+ "In terms of diagnostics, there is a slight problem with multicollinearity (often the case in SLX specifications), strong non-normality and evidence of heteroskedasticity. Moran's I is significant, as are both LM-tests, but neither of the robust forms is significant. Based on the relative magnitudes of the test statistics, there is a slight indication of a possible Lag alternative, i.e., a spatial Durbin specification. However, this indication is not as strong as that provided by the LM-SDM test statistics in the classic specification."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8a0c186-dd30-4f64-9a5b-d95bb562ebde",
+ "metadata": {},
+ "source": [
+ "## ML Estimation of the Error Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cc74112-a2eb-4efa-9116-d1e5197d20ed",
+ "metadata": {},
+ "source": [
+ "### Principle"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0385d805-bfb4-4e79-b0ed-0fc59c811926",
+ "metadata": {},
+ "source": [
+ "The spatial Error model is a regular linear regression model with spatially autoregressive error terms:\n",
+ "\n",
+ "$$y = X\\beta + u, u = \\lambda W + e,$$\n",
+ "\n",
+ "where $\\lambda$ is the spatial autoregressive (error) parameter\n",
+ "\n",
+ "The point of departure of the Maximum Likelihood estimation of this model is again the assumption of joint normality of the error terms. However, the error terms are no longer\n",
+ "independent, but they have a covariance matrix that follows from the \n",
+ "spatial autoregressive specification. Specifically:\n",
+ "\n",
+ "$$E[uu'] = \\Sigma = \\sigma^2 [(I - \\lambda W)'(I - \\lambda W)]^{-1},$$\n",
+ "\n",
+ "where $\\sigma^2$ is the variance of the remaining error terms. \n",
+ "\n",
+ "This leads to the log-likelihood function:\n",
+ "\n",
+ "$$\\ln L = -(n/2)(\\ln 2\\pi) - (n/2) \\ln \\sigma^2 + \\ln | I - \\lambda W | \\\\\n",
+ " - \\frac{1}{2\\sigma^2} (y - X \\beta)'(I - \\lambda W)'(I - \\lambda W)(y - X \\beta).$$\n",
+ "\n",
+ "The last term in this expression is a sum of squared residuals in a \n",
+ "regression of $(I - \\lambda W)y$ on $(I - \\lambda W)X$, i.e., a standard OLS estimation, but\n",
+ "based on the\n",
+ "spatially filtered dependent and explanatory variables (of course, this assumes a value for $\\lambda$).\n",
+ "The regression of the spatially filtered variables is referred to as *spatially weighted least squares* or *spatial Cochran-Orcutt*, the latter due to its similarity to a familiar time series transformation.\n",
+ "\n",
+ "As in ths spatial lag case, maximization of the log-likelihood is simplified since a *concentrated* likelihood can be derived that is only a function of the single parameter $\\lambda$. Once an estimate for $\\lambda$ is obtained, the corresponding estimates for $\\beta$ and $\\sigma^2$ are easily computed. For technical details, see Chapter 10 of Anselin and Rey (2014).\n",
+ "\n",
+ "Inference is again based on an asymptotic variance matrix."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1bfd7648-bdd1-4ebb-bcf5-4dc74782693d",
+ "metadata": {},
+ "source": [
+ "### Implementation methods"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ada2a0d-81fc-4b07-8ba9-d7e1bc101b1a",
+ "metadata": {},
+ "source": [
+ "ML estimation of the spatial error model works in the same way as for the lag model. It is implemented by means of `spreg.ML_Error`, with all the standard regression arguments (i.e., at a minimum, **y**, **x** and **w**). Again, three different methods are implemented: `full`, `ord` an `LU`. These differ only in the way the Jacobian term $\\ln | I - \\lambda W |$ is computed. \n",
+ "\n",
+ "As in the lag case, the default optimization method is *brute force*, or `method=\"full\"`. The other options are the Ord eigenvalue method, `method = \"ord\"`, and the LU matrix decomposition, `method = \"LU\"`. Again, the latter is the only reliable method for larger data sets."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "75672c3c-9170-4135-bdaf-2a335eec8090",
+ "metadata": {},
+ "source": [
+ "The ML estimation is illustrated for the same specification as before, first using `method=\"full\"`. Since this is also the default, it is not necessary to explicitly include this argument, but it is listed here for clarity. To compare the relative speed of the different methods, `time.time()` is used.\n",
+ "\n",
+ "Unlike what holds for the spatial lag model, there are no impacts for the spatial error model, since any spillover effects are limited to the error terms. Since the model impacts are based on averages (conditional expectation of y given X), the error terms are immaterial (on average, they are zero)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71307ef9-cc53-46a6-866e-5bd521e4503e",
+ "metadata": {},
+ "source": [
+ "#### Method `full`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1975d6b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "err1 = ML_Error(y,x,w=wq,method=\"full\",\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(err1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec92847a-2982-4d67-a514-da927b0fd7f6",
+ "metadata": {},
+ "source": [
+ "Even though the LM tests provided only weak evidence of an error alternative, the estimate of $\\lambda$, 0.451, is highly significant. The other regression coefficients change slightly relative to the OLS estimates, but not nearly as much as was the case for the spatial lag model. In fact, the estimates should *not* change much, since OLS remains unbiased (but becomes inefficient) in the presence of spatial error autocorrelation.\n",
+ "\n",
+ "The measures of fit include a pseudo $R^2$, 0.633 (squared correlation between observed and predicted y), the log-likelihood and the AIC and SC information criteria. Since the lag and error models have the same number of parameters, their log-likelihoods are directly comparable. In the lag model, the result was -2418.99, here it is -2428.85. In other words, the log-likelihood in the lag model is somewhat *larger* than for the error model, confirming the indication given by the LM test statistics (in favor of the lag model).\n",
+ "\n",
+ "As before, important attributes of the results are stored in the regression object. These include the regression coefficients, in **betas**, with the spatial autoregressive coefficient as the last element. The latter is also included separately as **lam**. The standard errors are in **std_err**, z-statistics and p-values in **z_stat**, and the complete variance-covariance matrix is **vm**.\n",
+ "\n",
+ "Since there is no reduced form for the error model, there is only one type of predicted value, contained in **predy**. However, there are two types of residuals, the classic residual, $u = y - X \\hat{\\beta}$, stored in **u**, and the spatially filtered residuals, $u - \\lambda W u$, stored in **e_filtered**. The estimate for the error variance, $\\sigma^2$, is based on the latter."
+ ]
+ },
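+ {
+ "cell_type": "markdown",
+ "id": "err-attrs-sketch-md",
+ "metadata": {},
+ "source": [
+ "As a small illustration, these attributes can also be inspected directly. The sketch below assumes that, as in the other `spreg` classes, **z_stat** is a list of (statistic, p-value) pairs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "err-attrs-sketch-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"standard errors:\", np.array(err1.std_err).flatten())\n",
+ "for z, p in err1.z_stat:\n",
+ "    print(f\"  z = {z:8.3f}   p = {p:0.4f}\")"
+ ]
+ },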
+ {
+ "cell_type": "markdown",
+ "id": "9866e7c6-4fc9-4cf4-af2f-7587b6c2a490",
+ "metadata": {},
+ "source": [
+ "The contents of the **betas** and **lam** attributes show how the estimate for $\\lambda$ is also the last element in **betas**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "386e6b44-9f51-4572-9ea1-7fa98f7beb4f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "print(\"betas \",err1.betas)\n",
+ "print(f\"lambda: {err1.lam:0.3f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2d3b5229-f169-4d15-aafe-416f325183a5",
+ "metadata": {},
+ "source": [
+ "#### Method `ord`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6ba06a23-111a-430e-bd60-ce54ee60a660",
+ "metadata": {},
+ "source": [
+ "The Ord eigenvalue method is invoked by means of `method=\"ord\"`. All other attributes are the same as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1ec4ab7-5eed-4c14-b046-2f9efff13642",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "err2 = ML_Error(y,x,w=wq,method=\"ord\",\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(err2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff1ea981-669d-4ba0-a200-6f41c17717f0",
+ "metadata": {},
+ "source": [
+ "The coefficient estimates are identical to those obtained with the `full` method. There are some slight differences in the computed standard errors (and thus also in the z-values and p-values), but the overall effect is minimal. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0439c7c-771a-446f-8677-51570d7f8dea",
+ "metadata": {},
+ "source": [
+ "#### Method `LU`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "846b5ac1-e4b0-4b99-8a56-7663403935d1",
+ "metadata": {},
+ "source": [
+ "Again, all arguments are the same, except for `method = \"LU\"`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e485647-951e-4b62-80be-b1bb505b5027",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "err3 = ML_Error(y,x,w=wq,method=\"LU\",\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(err3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34682841-1194-4f32-a14d-1aea4ce6b9e3",
+ "metadata": {},
+ "source": [
+ "In this case, the estimation results are identical to those for the `full` method."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e768c5f-9216-48f7-ac1f-f25c2d7e0a43",
+ "metadata": {},
+ "source": [
+ "## Predicted Values and Residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1c4af1a9-aba6-4b50-b4f1-f45694e634c4",
+ "metadata": {},
+ "source": [
+ "The two types of residuals can be readily turned into a data frame by means of `pd.DataFrame` applied to an array constructed with `np.hstack`, in the same way as was done for the OLS predicted values and residuals. In the example, the associated variable names are **resid** and **filtered**, passed as the `columns` argument.\n",
+ "\n",
+ "Descriptive statistics are obtained with `describe()`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "343c44c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds = pd.DataFrame(np.hstack((err1.u,err1.e_filtered)),columns=['resid','filtered'])\n",
+ "preds.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9bd86573-c074-4aec-9d51-13314ca79451",
+ "metadata": {},
+ "source": [
+ "Note some important differences between the two concepts. The filtered residuals are the proper estimates for the regression error term e (not u). Clearly, it has a mean of zero, which is not quite the case for the unfiltered residuals. Also, the variance for the filtered residual is slightly smaller, but the range is very much smaller."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "18400a28-a42c-46ad-9657-f17932d67a68",
+ "metadata": {},
+ "source": [
+ "The correlation between the two concepts is high, but not perfect, at 0.968."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "44974590-467f-49ed-beec-9725184c233e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Correlation between residuals: {preds['resid'].corr(preds['filtered']):0.3f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "946ad797-705e-4782-9a46-f0865292e917",
+ "metadata": {},
+ "source": [
+ "#### Spatial pattern of residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "20a5a87f-e2b2-4149-902c-7a2560b6da56",
+ "metadata": {},
+ "source": [
+ "A final interesting comparison is between the spatial pattern of the two types of residuals. To assess this, a simple Moran scatterplot is constructed, where the spatial lag is computed by means of `libpysal.lag_spatial`. The plot itself is constructed with `sns.regplot`, which superimposes a regression line on the scatter plot of the spatial lag on the original variable. No customization of the graph is carried out.\n",
+ "\n",
+ "For the *naive* residuals, this yields the following plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ecebb8e-d396-4e1c-8092-b6db612aebee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "werr = lag_spatial(wq,preds['resid']).reshape(-1,1)\n",
+ "sns.regplot(x=preds['resid'],y=werr)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd782036-bd82-45d5-90c7-2d527d46fd61",
+ "metadata": {},
+ "source": [
+ "The regression line shows a strong positive slope, suggesting remaining spatial clustering."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "245082da-2703-4282-bf5b-24c58f83edac",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wfor = lag_spatial(wq,preds['filtered']).reshape(-1,1)\n",
+ "sns.regplot(x=preds['filtered'],y=wfor)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "12ae004f-e7cc-47a1-85cc-6426152ff8d3",
+ "metadata": {},
+ "source": [
+ "In contrast, the slope for the filtered residuals is essentially flat, suggesting that the spatial autocorrelation has been removed."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed8bd5bc-4921-40f3-96c3-79dfd68ed92d",
+ "metadata": {},
+ "source": [
+ "#### Mapping predicted values and residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "54225ac9-bdc4-4560-a62f-c2761b8ed963",
+ "metadata": {},
+ "source": [
+ "Optionally, the predicted values and residuals can be added to the spatial data frame in order to construct associated maps. However, since these maps create only visual impressions of spatial patterning, this is not further pursued here."
+ ]
+ },
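+ {
+ "cell_type": "markdown",
+ "id": "err-resid-map-sketch-md",
+ "metadata": {},
+ "source": [
+ "Purely as an illustration, a minimal sketch of how the residuals could be attached to the spatial data frame and mapped is given below. The column names and map settings are arbitrary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "err-resid-map-sketch-code",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative only -- attach the residuals and make a quick choropleth map\n",
+ "dfs['resid_u'] = err1.u.flatten()\n",
+ "dfs['resid_filtered'] = err1.e_filtered.flatten()\n",
+ "ax = dfs.plot(column='resid_filtered', cmap='coolwarm', legend=True, figsize=(6, 8))\n",
+ "ax.set_axis_off()\n",
+ "plt.show()"
+ ]
+ },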
+ {
+ "cell_type": "markdown",
+ "id": "708ffa42-763f-4ee2-acf0-2ab079795cfd",
+ "metadata": {},
+ "source": [
+ "## ML Estimation of the SLX-Error Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a27b8d3-13cb-4e6d-aef2-549f19c6dc6f",
+ "metadata": {},
+ "source": [
+ "ML estimation of the SLX-Error model is a special case of `spreg.ML_Error`, with the additional argument of `slx_lags=1` (or a larger value). Everything else remains the same. More specifically, the three methods of `full`, `ord` and `LU` are again available. Only the default `full` is considered here. The results are essentially the same for the other methods. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d5bc8e4-3683-4bfd-a904-f944cc69a71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "slxerr = ML_Error(y,x,w=wq,slx_lags=1,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "t1 = time.time()\n",
+ "print(\"Time in seconds: \",t1-t0)\n",
+ "print(slxerr.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73b650f7-2272-4f1a-8d2b-e6c007f45d3a",
+ "metadata": {},
+ "source": [
+ "Compared to the OLS SLX estimates, there are some minor changes, but much less so than for the spatial Durbin model. For example, **W_EP_NOHSDP** becomes marginally non-significant (p=0.02), whereas it was significant in OLS. This is the only lag coefficient where the sign differs from that of the original coefficient, which was also the case in OLS. In terms of fit, there is a slight improvement, from an AIC of 4903.5 in OLS to 4841.6 here (smaller is better).\n",
+ "\n",
+ "The spatial autoregressive coefficient, 0.400, is highly significant.\n",
+ "\n",
+ "As in the lag case, further refinements of the model specification can be carried out by eliminating some lag terms by means of `slx_vars`, as in the standard SLX model. This is not further pursued here."
+ ]
+ },
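+ {
+ "cell_type": "markdown",
+ "id": "c3d4e5f6",
+ "metadata": {},
+ "source": [
+ "A hypothetical sketch of such a refinement is shown below. The `slx_vars` argument takes a list of booleans, one per column of **x**, indicating which spatial lags to keep; the particular pattern used here is arbitrary and purely illustrative, so the result is not interpreted."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4e5f6a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# hypothetical sketch - the boolean pattern is arbitrary\n",
+ "# one entry per column of x: True keeps the lag, False drops it\n",
+ "slxerr_sel = ML_Error(y,x,w=wq,slx_lags=1,\n",
+ "                      slx_vars=[True,True,False,False],\n",
+ "                      name_w=w_name,name_ds=ds_name)\n",
+ "print(slxerr_sel.summary)"
+ ]
+ },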
+ {
+ "cell_type": "markdown",
+ "id": "9f5f5846-51fa-4ad4-985f-c11a68d2f380",
+ "metadata": {},
+ "source": [
+ "### Likelihood-Ratio Tests"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab69fd57-726f-4f4d-8bd2-2273235feaa9",
+ "metadata": {},
+ "source": [
+ "A likelihood ratio test is $LR = 2.0 \\times (LogL_1 - LogL_0)$, where $LogL_1$ is the log-likelihood for the *unrestricted* model (i.e., with more non-zero parameters), and $LogL_0$ is the log-likelihood for the *restricted* model (i.e., where some parameters, like $\\rho$, are set to zero). For example, a likelihood ratio test on the coefficient $\\rho$ in the spatial lag model would use the log likelihood in the spatial lag model as $LogL_1$, and the log-likelihood from the classic regression as $LogL_0$. \n",
+ "\n",
+ "The $LR$ statistic is distributed as a Chi-square random variable with degrees of freedom equal to the number of restrictions, i.e., 1 for the spatial autoregressive coefficient, but more for the SLX and spatial Durbin models, depending on how many explanatory variables are included. The LR tests are an alternative to the Wald tests (asymptotic t-values) on the spatial coefficient and the LM tests for spatial effects considered earlier.\n",
+ "\n",
+ "The same likelihood ratio test as in the lag model can be implemented with `spreg.diagnostics.likratiotest`. Its two arguments are the regression object for the constrained model and the regression object for the unconstrained model. The result is a dictionary with the statistic (`likr`), the degrees of freedom (`df`) and the p-value (`p-value`)"
+ ]
+ },
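+ {
+ "cell_type": "markdown",
+ "id": "e5f6a0b1",
+ "metadata": {},
+ "source": [
+ "Purely as an illustration of the formula (the log-likelihood values below are made up and not taken from this notebook), the LR statistic and its p-value can also be computed directly with *scipy*."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6a0b1c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from scipy.stats import chi2\n",
+ "\n",
+ "logl0 = -2418.0    # hypothetical log-likelihood, restricted model\n",
+ "logl1 = -2382.0    # hypothetical log-likelihood, unrestricted model\n",
+ "restrictions = 1   # number of restrictions\n",
+ "lr = 2.0 * (logl1 - logl0)\n",
+ "pval = chi2.sf(lr,restrictions)\n",
+ "print(f\"LR = {lr:0.3f}, p-value = {pval:0.4f}\")"
+ ]
+ },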
+ {
+ "cell_type": "markdown",
+ "id": "fc1e3f8e-4901-4c8e-8cf9-5e41460d40fa",
+ "metadata": {},
+ "source": [
+ "Four different LR test consider the following constraints:\n",
+ "- Error vs OLS, i.e., $\\lambda = 0$ in the Error model: arguments are **ols1** and **err1**\n",
+ "- SLX-Error vs OLS, i.e., both $\\lambda = 0$ and $\\gamma = 0$ in the SLX-Error model: argumentes are **ols1** and **slxerr**\n",
+ "- SLX-Error model vs Error model, i.e., $\\gamma = 0$ in the SLX-Error model: arguments are **err1** and **slxerr**\n",
+ "- SLX-Error model vs SLX, i.e., $\\lambda = 0$ in the SLX-Error model: arguments are **slx1** and **slxerr**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c3b9c76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "LR_Error = likratiotest(ols1,err1)\n",
+ "LR_SLXO = likratiotest(ols1,slxerr)\n",
+ "LR_SLXE = likratiotest(err1,slxerr)\n",
+ "LR_SLXS = likratiotest(slx1,slxerr)\n",
+ "\n",
+ "print(f\"LR statistic Error-OLS: {LR_Error[\"likr\"]:0.3f}, d.f. {LR_Error[\"df\"]:2d}, p-value {LR_Error[\"p-value\"]:0.4f}\")\n",
+ "print(f\"LR statistic SLX-Err-OLS: {LR_SLXO[\"likr\"]:0.3f}, d.f. {LR_SLXO[\"df\"]:2d}, p-value {LR_SLXO[\"p-value\"]:0.4f}\")\n",
+ "print(f\"LR statistic SLX-Err-Error: {LR_SLXE[\"likr\"]:0.3f}, d.f. {LR_SLXE[\"df\"]:2d}, p-value {LR_SLXE[\"p-value\"]:0.4f}\")\n",
+ "print(f\"LR statistic SLX-Err-SLX: {LR_SLXS[\"likr\"]:0.3f}, d.f. {LR_SLXS[\"df\"]:2d}, p-value {LR_SLXS[\"p-value\"]:0.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca6ed963-0f88-4354-8236-a94beba50e4a",
+ "metadata": {},
+ "source": [
+ "In the current example, all null hypotheses are strongly rejected. \n",
+ "\n",
+ "For the error model in this example, the LM-Error test statistic was 88.33, the Wald test was 9.560^2 or 91.38, and the LR test (above) 72.72. Whereas the LR and Wald test follow the prescribed order (LM < LR < W), the LM-Lag test does not, which may point to potential remaining specification problems.\n",
+ "\n",
+ "As mentioned, the model can be refined by selectively setting `slx_vars`, but this is not pursued here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b086a22",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "As practice, different model specifications could be considered, including adding additional explanatory variables, selectively removing some lag terms, and using different spatial weights. "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/16_GMM_estimation_spatial_error.ipynb.txt b/_sources/notebooks/16_GMM_estimation_spatial_error.ipynb.txt
new file mode 100644
index 00000000..b77e0bd7
--- /dev/null
+++ b/_sources/notebooks/16_GMM_estimation_spatial_error.ipynb.txt
@@ -0,0 +1,490 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "89145993",
+ "metadata": {},
+ "source": [
+ "# GMM Estimation - Spatial Error Model\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/26/2024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a85427f9",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "This module covers the estimation of spatial error models by means of the general method of moments (GMM). As for the IV estimation of the spatial lag model, this allows for both exogenous and endogenous explanatory variables. The estimates for the spatial error parameter are obtained as the solution of a set of moment conditions. As of version 1.4 of *spreg*, all spatial error estimation is implemented through the `spreg.GMM_Error` command. This is essentially a wrapper around the original implementations of the various `GM_Error`, `GM_Endog_Error` and `GM_Combo` functions (that still work exactly as before) with a more simplified interface.\n",
+ "Beyond the classic spatial error specification, more complex models can be estimated by including `slx_lags` (for additional WX) or `add_wy` (for inclusion of a spatially lagged dependent variable). This yields a range of higher order models, such as SLX Error, SARSAR (spatial lag with spatial autoregressive errors), and the generalized nested specification (GNS), i.e., a Spatial Durbin model with spatial autoregressive errors. The latter two models are treated in a separate notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9a2c9eb",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main module continues to be *spreg* for spatial regression analysis. From this, `GMM_Error` is imported. In addition, the utilities in *libpysal* (to open spatial weights and access the sample data set), *pandas* and *geopandas* are needed. All of these rely on *numpy* as a dependency. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ac490b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "from libpysal.weights import lag_spatial\n",
+ "\n",
+ "from spreg import GMM_Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ee18820",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " \n",
+ "- from libpysal:\n",
+ " - io.open\n",
+ " - examples.get_path\n",
+ "\n",
+ "- from spreg:\n",
+ " - spreg.GMM_Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47934d1b-7587-4ddf-be38-83904eede8e8",
+ "metadata": {},
+ "source": [
+ "### Variable definition and data input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfb04af4-4521-4ac0-8e55-d3b92f54a403",
+ "metadata": {},
+ "source": [
+ "The data set and spatial weights are again from the **chicagoSDOH** sample data set:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity weights\n",
+ "\n",
+ "To illustrate the methods, the same descriptive model is used as in the ML notebook. It relates the rate of uninsured households in a tract(for health insurance, **EP_UNINSUR**) to the lack of high school education (**EP_NOHSDP**), the economic deprivation index (**HIS_ct**), limited command of English (**EP_LIMENG**) and the lack of access to a vehicle (**EP_NOVEH**). This is purely illustrative of a spatial error specification and does not have a particular theoretical or policy motivation.\n",
+ "\n",
+ "In an alternative specification, **HIS_ct** is considered to be endogenous, with, as before, **COORD_X** and **COORD_Y** as instruments.\n",
+ "\n",
+ "The file names and variable names are set in the usual manner. Any customization for different data sets/weights and different variables should be specified in this top cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "14a1e98e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infileq = get_path(\"Chi-SDOH_q.gal\") # queen contiguity weights created with GeoDa\n",
+ "\n",
+ "y_name = 'EP_UNINSUR'\n",
+ "x_names = ['EP_NOHSDP','HIS_ct','EP_LIMENG','EP_NOVEH']\n",
+ "xe_names = ['EP_NOHSDP','EP_LIMENG','EP_NOVEH']\n",
+ "yend_names = ['HIS_ct']\n",
+ "q_names = ['COORD_X','COORD_Y']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "w_name = 'Chi-SDOH_q'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9514984-a22e-4d5b-a993-f40771a290a0",
+ "metadata": {},
+ "source": [
+ "The `read_file` and `open` functions are used to access the sample data set and contiguity weights. The weights are row-standardized and the data frames for the dependent and explanatory variables are constructed. As before, this functionality is agnostic to the actual data sets and variables used, since it relies on the specification given in the initial block above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3248d23d-2247-40bf-a3a5-3ba21b477aa8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = open(infileq).read() \n",
+ "wq.transform = 'r' # row-transform the weights\n",
+ "y = dfs[y_name]\n",
+ "x = dfs[x_names]\n",
+ "yend = dfs[yend_names]\n",
+ "xe = dfs[xe_names]\n",
+ "q = dfs[q_names]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8a0c186-dd30-4f64-9a5b-d95bb562ebde",
+ "metadata": {},
+ "source": [
+ "## GMM Estimation of the Error Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cc74112-a2eb-4efa-9116-d1e5197d20ed",
+ "metadata": {},
+ "source": [
+ "### Principle"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dbed835b",
+ "metadata": {},
+ "source": [
+ "As in the ML case, estimation of the spatial error model by means of GMM is based on spatially weighted least squares, where a consistent estimate for $\\lambda$ is used to create the spatially filtered dependent and explanatory variables, $(I - \\hat{\\lambda} W)y$ and $(I - \\hat{\\lambda} W)X$. If the only interest is in obtaining consistent estimates for $\\beta$, then any consistent estimate for $\\lambda$ will do. The only property that matters is consistency, since, unlike what holds in the spatial lag case, the precision of the $\\lambda$ estimate does not affect the precision of the $\\beta$ estimates.\n",
+ "\n",
+ "In two classic papers by Kelejian and Prucha (1998,1999), it was shown how a consistent estimate can be obtained as the solution of a set of moment equations formulated in terms of the regression residuals and their spatial lags. However, this did not provide an asymptotic variance for the spatial parameter, and thus did not allow for inference. In a series of later papers (Kelejian and Prucha 2010, Arraiz et al. 2010, 2013, Drukker et al 2013), the approach was extended to also allow for heteroskedasticity of unknown form and to include an asymptotic variance matrix.\n",
+ "\n",
+ "The technical details are quite complex and are outlined in Chapter 9 of Anselin and Rey (2014). The upshot is that there are three main estimation methods: the original Kelejian-Prucha generalized momens (GM) approach, the generalized method of moments (GMM) approach with heteroskedastic errors; and the GMM approach with homoskedastic errors. In practice, the GMM-heteroskedastic approach is greatly preferred. It is also the default used by `GMM_Error`.\n",
+ "\n",
+ "The GM and GMM approaches can be extended to models that include endogenous variables by means of spatially weighted two stage least squares (SW2SLS). This estimator uses the same expression as standard 2SLS, but replaces the $Z$ matrix of exogenous and endogenous regressors by its spatially filtered counterpart, $Z - \\hat{\\lambda} WZ$. The instrument matrix $Q$ is unaffected.\n",
+ "\n",
+ "In both cases, inference in based on two asymptotic variance matrices: one for the regression coefficients and one for the spatial error coefficient."
+ ]
+ },
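+ {
+ "cell_type": "markdown",
+ "id": "0a1b2c3d",
+ "metadata": {},
+ "source": [
+ "To make the filtering operation concrete, the minimal sketch below computes a spatially filtered dependent variable, y - lam*Wy, with `lag_spatial`, using an arbitrary made-up value for lam. It is purely illustrative and plays no role in the estimation, which carries out the filtering internally."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1b2c3d4e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustration only - lam is an arbitrary placeholder, not an estimate\n",
+ "lam = 0.5\n",
+ "yv = y.to_numpy().reshape(-1,1)\n",
+ "y_filtered = yv - lam * lag_spatial(wq,yv)\n",
+ "print(y_filtered[:5])"
+ ]
+ },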
+ {
+ "cell_type": "markdown",
+ "id": "1bfd7648-bdd1-4ebb-bcf5-4dc74782693d",
+ "metadata": {},
+ "source": [
+ "### Implementation methods"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4eec12f7",
+ "metadata": {},
+ "source": [
+ "GMM estimation of the spatial error model is implemented in `spreg.GMM_Error`. This requires the standard regression arguments (i.e., at a minimum, `y`, `x` and `w`, as well as `yend` and `q` for the endogenous case). Three methods are implemented, specified by means of the `estimator` argument.\n",
+ "\n",
+ "The default is `estimator = \"het\"` for the heteroskedastic-robust GMM method. Other options are `estimator = \"hom\"`, for GMM with homoskedastic errors, and `estimator = \"kp98\"`, for the legacy Kelejian-Prucha GM approach.\n",
+ "\n",
+ "In addition, there are some more technical options for the GMM methods. In practice, these are rarely needed. \n",
+ "\n",
+ "For GMM-heteroskedastic, the extra arguments include `step1c`, `max_iter` and `epsilon`. The default is `step1c = False`, `max_iter = 1` and `epsilon = 0.00001`. When set to `True`, `step1c` carries out an additional estimation step for the autoregressive coefficient after the solution of the initial set of moment equations, as in Arraiz et. al (2010). The default follows the later paper by Arraiz et al. (2013) and skips this step. It is possible to iterate the procedure by using the new/updated residuals in additional rounds of estimation by setting `max_iter` to a larger value than `1`. Typically, this is not needed. When there are additional iterations, `epsilon` is used as a convergence criterion to stop the procedure.\n",
+ "\n",
+ "For GMM-homoskedastic, the option `A1` determines the exact manner in which the first set of moment equations is constructed. This pertains to very technical details regarding the trace of a matrix. The default is `A1 = \"hom_sc\"`, which applies a scaling factor suggested in Drukker et al. (2013). Other options are `A1 = \"hom\"` for no scaling, and `A1 = \"het\"` for a slightly different matrix expression (details are given on p. 217 of Anselin and Rey 2014). In practice, the options seldom make much of a difference and the default is fine. There are no `step1c` or `max_iter` options for this case.\n",
+ "\n",
+ "The legacy `kp98` method has no special options."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71307ef9-cc53-46a6-866e-5bd521e4503e",
+ "metadata": {},
+ "source": [
+ "#### Estimator `het`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "799e1559",
+ "metadata": {},
+ "source": [
+ "GMM-heteroskedastic is the default, so the `estimator` argument does not need to be specified. The first illustration is for all default settings with only exogenous regressors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1975d6b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "err1 = GMM_Error(y,x,w=wq,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(err1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "debc6278",
+ "metadata": {},
+ "source": [
+ "The estimate of $\\lambda$, 0.4677, is almost identical to that obtained by means of ML (0.451). The other characteristics of the model are very similar as well, with a pseudo $R^2$ of 0.634 (compared to 0.633). The interpretation of the other features of the regression object are the same as for the ML estimation, and will not be repeated here.\n",
+ "\n",
+ "Again, there is only one type of predicted value, contained in **predy**. There are two types of residuals, the classic residual, $u = y - X \\hat{\\beta}$, stored in **u**, and the spatially filtered residuals, $u - \\lambda W u$, stored in **e_filtered**. "
+ ]
+ },
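+ {
+ "cell_type": "markdown",
+ "id": "2c3d4e5f",
+ "metadata": {},
+ "source": [
+ "As a brief illustrative sketch (not part of the main workflow), the two types of residuals can be collected in a data frame and compared; the name **preds_gmm** and the column labels are arbitrary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d4e5f60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative sketch - names are arbitrary\n",
+ "preds_gmm = pd.DataFrame({\"predy\": err1.predy.flatten(),\n",
+ "                          \"resid\": err1.u.flatten(),\n",
+ "                          \"filtered\": err1.e_filtered.flatten()})\n",
+ "print(preds_gmm.describe())\n",
+ "print(f\"Correlation between residuals: {preds_gmm['resid'].corr(preds_gmm['filtered']):0.3f}\")"
+ ]
+ },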
+ {
+ "cell_type": "markdown",
+ "id": "418df371",
+ "metadata": {},
+ "source": [
+ "To illustrate the (minor) effect of the additional arguments, `step1c` is next set to `True`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0a22c12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "err2 = GMM_Error(y,x,w=wq,\n",
+ " step1c=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(err2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "343e01de",
+ "metadata": {},
+ "source": [
+ "The estimate for $\\lambda$ is only marginally different, at 0.4722, with a slightly smaller standard error, but the fit is unaffected (in fact, in terms of pseudo $R^2$, it is slightly worse than before, 0.6337 vs. 0.6340).\n",
+ "\n",
+ "With `max_iter = 10`, the differences are again marginal."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30acd3e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "err3 = GMM_Error(y,x,w=wq,\n",
+ " max_iter=10,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(err3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2d3b5229-f169-4d15-aafe-416f325183a5",
+ "metadata": {},
+ "source": [
+ "#### Estimator `hom`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6ba06a23-111a-430e-bd60-ce54ee60a660",
+ "metadata": {},
+ "source": [
+ "In practice, ignoring heteroskedasticiy is typically not a good idea for cross-sectional regressions. The `hom` estimator is included here for the sake of completeness, but should usually not be considered. Only the default settings are illustrated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1ec4ab7-5eed-4c14-b046-2f9efff13642",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "err4 = GMM_Error(y,x,w=wq,\n",
+ " estimator=\"hom\",\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(err4.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff1ea981-669d-4ba0-a200-6f41c17717f0",
+ "metadata": {},
+ "source": [
+ "The spatial coefficient is again slightly different, but the effect on the $\\beta$ coefficient is marginal. The main differences are in the results for the standard errors. As mentioned, ignoring heteroskedasticity may yield an overly optimistic impression of precision."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0439c7c-771a-446f-8677-51570d7f8dea",
+ "metadata": {},
+ "source": [
+ "#### Estimator `kp98`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "846b5ac1-e4b0-4b99-8a56-7663403935d1",
+ "metadata": {},
+ "source": [
+ "The final estimator is the legacy `kp98` method, which is only included for completeness. Since it does not provide inference for the spatial parameter, it is otherwise not recommended for use in practice."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e485647-951e-4b62-80be-b1bb505b5027",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "err5 = GMM_Error(y,x,w=wq,\n",
+ " estimator=\"kp98\",\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(err5.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "34682841-1194-4f32-a14d-1aea4ce6b9e3",
+ "metadata": {},
+ "source": [
+ "Note how the output now does not list standard errors, z-statistics and p-value for the $\\lambda$ estimate. The regression estimates are essentially the same."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "708ffa42-763f-4ee2-acf0-2ab079795cfd",
+ "metadata": {},
+ "source": [
+ "## GMM Estimation of the SLX-Error Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a27b8d3-13cb-4e6d-aef2-549f19c6dc6f",
+ "metadata": {},
+ "source": [
+ "As for ML, GMM estimation of the SLX-Error model is a special case of `spreg.GMM_Error`, with the additional argument of `slx_lags=1` (or a larger value). Everything else remains the same. More specifically, the three estimators of `het`, `hom` and `kp98` are again available. Only the default `het` is considered here. The results show only minor differences for the other methods. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d5bc8e4-3683-4bfd-a904-f944cc69a71b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxerr1 = GMM_Error(y,x,w=wq,slx_lags=1,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(slxerr1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73b650f7-2272-4f1a-8d2b-e6c007f45d3a",
+ "metadata": {},
+ "source": [
+ "In general, as before, there is little support for including all the WX terms after controlling for spatial error autocorrelation. Only the coefficient of **W_HIS_ct** is significant, but, as before, the value of the estimate is larger than that for the unlagged regressor.\n",
+ "\n",
+ "The spatial autoregressive coefficient, 0.422, is highly significant.\n",
+ "\n",
+ "As usual, further refinements of the model specification can be carried out by eliminating some lag terms by means of `slx_vars`, as in the standard SLX model. This is not further pursued here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aba39aea",
+ "metadata": {},
+ "source": [
+ "## Exogenous and Endogenous Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7e9b6a82",
+ "metadata": {},
+ "source": [
+ "Additional endogenous variables and associated instruments are added in the standard way by including `yend` and `q` as arguments. Note that in the example, the regression matrix is now set to `xe`, for only the exogenous variables. Everything else is the same as before. Only the default `estimator = \"het\"` is illustrated, with its default settings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7a9c39b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "enderr = GMM_Error(y,xe,w=wq,yend=yend,q=q,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(enderr.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b05d3b36",
+ "metadata": {},
+ "source": [
+ "The estimate for $\\lambda$ is again in the same general ballpark, but **HIS_ct** is no longer significant. As usual, the endogenous variable is listed as **Instrumented** as well as the **Instruments**.\n",
+ "\n",
+ "Finally, the SLX model can be estimated with additional endogenous variables, by including `yend` and `q` with `slx_lags=1`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45855356",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slxenderr = GMM_Error(y,xe,w=wq,yend=yend,q=q,\n",
+ " slx_lags=1,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(slxenderr.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6973349",
+ "metadata": {},
+ "source": [
+ "In this example, none of the WX terms end up being significant. In addition, of the original regressors, only **EP_LIMENG** remains as significant."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b086a22",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "As practice, different model specifications could be considered, including adding additional explanatory variables, selectively removing some lag terms, and using different spatial weights, in the same way as for the other models."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/17_GMM_higher_order.ipynb.txt b/_sources/notebooks/17_GMM_higher_order.ipynb.txt
new file mode 100644
index 00000000..00371787
--- /dev/null
+++ b/_sources/notebooks/17_GMM_higher_order.ipynb.txt
@@ -0,0 +1,354 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "89145993",
+ "metadata": {},
+ "source": [
+ "# GMM Estimation - Higher Order Models\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/26/2024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a85427f9",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "This module covers the estimation of higher order spatial models, such as SAR-SAR (spatial lag with spatial autoregressive errors) and the generalized nested specification (GNS), i.e., a Spatial Durbin model with spatial autoregressive errors. These specifications are estimated as special cases of `spreg.GMM_Error`, by including the argument `add_wy = True`, without or with `slx_lags = 1`.\n",
+ "\n",
+ "In general, these specifications should be avoided, but they are included here for the sake of completeness. As shown by Koley and Bera (2024), the full set of parameters in the GNS model is not identified, and ML cannot be applied. The SAR-SAR models suffers from similar problems, and its ML estimation typically has a very hard time to convert, switching back and forth between the estimates for $\\rho$ and $\\lambda$ (this is referred to by Bivand and Piras in the `R-spatialreg` package as the \"banana\" problem). As a result, ML estimation of these models is not included in *spreg*. However, it remains possible to estimate them by means of IV/GMM methods, although the results need to be interpreted with caution. Also, as it turns out, in practice, the results often do not make sense and are difficult to interpret."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9a2c9eb",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The same modules are needed as for the GMM estimation of the error model: `GMM_Error` imported from `spreg`, utilities in *libpysal* (to open spatial weights and access the sample data set), *pandas* and *geopandas*."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ac490b0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "from libpysal.weights import lag_spatial\n",
+ "\n",
+ "from spreg import GMM_Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ee18820",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " \n",
+ "- from libpysal:\n",
+ " - io.open\n",
+ " - examples.get_path\n",
+ "\n",
+ "- from spreg:\n",
+ " - spreg.GMM_Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47934d1b-7587-4ddf-be38-83904eede8e8",
+ "metadata": {},
+ "source": [
+ "### Variable definition and data input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfb04af4-4521-4ac0-8e55-d3b92f54a403",
+ "metadata": {},
+ "source": [
+ "The data set and spatial weights are again from the **chicagoSDOH** sample data set. They are the same as for the GMM Error estimation:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity weights\n",
+ "\n",
+ "To illustrate the methods, the same descriptive model is used as in the ML notebook. It relates the rate of uninsured households in a tract(for health insurance, **EP_UNINSUR**) to the lack of high school education (**EP_NOHSDP**), the economic deprivation index (**HIS_ct**), limited command of English (**EP_LIMENG**) and the lack of access to a vehicle (**EP_NOVEH**). This is purely illustrative of a spatial error specification and does not have a particular theoretical or policy motivation.\n",
+ "\n",
+ "In an alternative specification, **HIS_ct** is considered to be endogenous, with, as before, **COORD_X** and **COORD_Y** as instruments.\n",
+ "\n",
+ "The file names and variable names are set in the usual manner. Any customization for different data sets/weights and different variables should be specified in this top cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "14a1e98e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infileq = get_path(\"Chi-SDOH_q.gal\") # queen contiguity weights created with GeoDa\n",
+ "\n",
+ "y_name = 'EP_UNINSUR'\n",
+ "x_names = ['EP_NOHSDP','HIS_ct','EP_LIMENG','EP_NOVEH']\n",
+ "xe_names = ['EP_NOHSDP','EP_LIMENG','EP_NOVEH']\n",
+ "yend_names = ['HIS_ct']\n",
+ "q_names = ['COORD_X','COORD_Y']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "w_name = 'Chi-SDOH_q'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9514984-a22e-4d5b-a993-f40771a290a0",
+ "metadata": {},
+ "source": [
+ "The `read_file` and `open` functions are used to access the sample data set and contiguity weights. The weights are row-standardized and the data frames for the dependent and explanatory variables are constructed. As before, this functionality is agnostic to the actual data sets and variables used, since it relies on the specification given in the initial block above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "3248d23d-2247-40bf-a3a5-3ba21b477aa8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = open(infileq).read() \n",
+ "wq.transform = 'r' # row-transform the weights\n",
+ "y = dfs[y_name]\n",
+ "x = dfs[x_names]\n",
+ "yend = dfs[yend_names]\n",
+ "xe = dfs[xe_names]\n",
+ "q = dfs[q_names]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8a0c186-dd30-4f64-9a5b-d95bb562ebde",
+ "metadata": {},
+ "source": [
+ "## The SAR-SAR Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cc74112-a2eb-4efa-9116-d1e5197d20ed",
+ "metadata": {},
+ "source": [
+ "### Exogenous variables only"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4eec12f7",
+ "metadata": {},
+ "source": [
+ "GMM estimation of the SAR-SAR model is implemented in `spreg.GMM_Error`. This requires the standard regression arguments (i.e., at a minimum, `y`, `x` and `w`, as well as `yend` and `q` for the endogenous case), as well as `add_wy = True`. The same three methods are implemented as for generic `GMM_Error`, but here only the default `estimator = \"het\"` will be considered. Also, none of the special options are included (see the GMM Error notebook for details)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "799e1559",
+ "metadata": {},
+ "source": [
+ "GMM-heteroskedastic is the default, so the `estimator` argument does not need to be specified. The first illustration is for all default settings with only exogenous regressors. As usual, there is the option to use higher order lags for the instruments, but this is not pursued here.\n",
+ "\n",
+ "Also, note that in constrast to the standard lag (and spatial Durbin) specifications, there is no `spat_impacts` option."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1975d6b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sarsar1 = GMM_Error(y,x,w=wq,\n",
+ " add_wy=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(sarsar1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "debc6278",
+ "metadata": {},
+ "source": [
+ "In this example, there was no evidence to include a spatial error term suggested by the AK test in the spatial lag model (see the notebook on IV estimation of the spatial lag model). As a result, it is not a surprise to find the coefficient $\\lambda$ not to be significant. Typical in the SAR-SAR model, the signs of $\\rho$ and $\\lambda$ tend to be opposite, which is difficult to interpret and may point to an identification problem."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e185d6b5",
+ "metadata": {},
+ "source": [
+ "### Exogenous and endogenous variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3d0b559",
+ "metadata": {},
+ "source": [
+ "An extension to include additional endogenous variables is carried out in the standard way. The endogenous variables and associated instruments are listed below the table with estimates. As usual, there is an option to include the spatial lags of the instruments (`True` by default)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "afa70961",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sarsar2 = GMM_Error(y,xe,w=wq,yend=yend,q=q,\n",
+ " add_wy = True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(sarsar2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0f21d50",
+ "metadata": {},
+ "source": [
+ "The consideration of endogeneity makes **HIS_ct** insignificant (as well as **EP_NOHSDP**). The $\\lambda$ coefficient remains non-significant, but its sign changes."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "418df371",
+ "metadata": {},
+ "source": [
+ "## The GNS Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "05445788",
+ "metadata": {},
+ "source": [
+ "The GNS model is treated as an SLX-Error model with an additional spatially lagged dependent variable as a regressor. This is accomplished by setting both `slx_lags = 1` (or higher) and `add_wy = True` in the `GMM_Error` call. As in the SAR-SAR case, there is no `spat_impacts` option."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1aabde7b",
+ "metadata": {},
+ "source": [
+ "### Exogenous variables only"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0a22c12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gns1 = GMM_Error(y,x,w=wq,\n",
+ " slx_lags = 1, add_wy = True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(gns1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "343e01de",
+ "metadata": {},
+ "source": [
+ "The results are only provided as an illustration of the functionality. Again, the typical pattern emerges of opposite signs for $\\rho$ and $\\lambda$, but now only $\\lambda$ is significant. None of the SLX terms is significant at p=0.01, and of the original regressors, only **EP_LIMENG** remains significant."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01449367",
+ "metadata": {},
+ "source": [
+ "### Exogenous and endogenous variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0f04f20",
+ "metadata": {},
+ "source": [
+ "Endogenous variables are included by specifying `yend` and `q` (the `x` argument is set to `xe`, for exogenous variables only)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30acd3e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gns2 = GMM_Error(y,xe,w=wq,yend=yend,q=q,\n",
+ " add_wy = True,slx_lags = 1,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(gns2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a89c73d",
+ "metadata": {},
+ "source": [
+ "In this example, the estimate of $\\rho$ is 0.9, which is highly suspicious. The estimate for $\\lambda$ is now marginally significant and positive. Of the other variables in the model, only **EP_LIMENG** remains as significant.\n",
+ "\n",
+ "Again, this highlights the caution that is needed when implementing this model. In general, it should be avoided and the model under consideration should be respecified in a different way."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b086a22",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "Since these models should be avoided, there is not much point in practicing them, other than to gain insight into the often conflicting (and confusing) indications provided by the parameter estimates. It is not because a model is not identified that no estimates can be obtained. However, those results are not necessarily (and usually not) meaningful."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/1_sample_data.ipynb.txt b/_sources/notebooks/1_sample_data.ipynb.txt
new file mode 100644
index 00000000..0358ca90
--- /dev/null
+++ b/_sources/notebooks/1_sample_data.ipynb.txt
@@ -0,0 +1,385 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# PySAL Sample Data Sets\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/06/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, the installation and input of PySAL sample data sets is reviewed.\n",
+ "\n",
+ "A video recording is available from the GeoDa Center YouTube channel playlist *Applied Spatial Regression - Notebooks*, at https://www.youtube.com/watch?v=qwnLkUFiSzY&list=PLzREt6r1NenmhNy-FCUwiXL17Vyty5VL6."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a443690f",
+ "metadata": {},
+ "source": [
+ "### Prerequisites\n",
+ "\n",
+ "Very little is assumed in terms of prerequisites. Sample data files are examined and loaded with *libpysal* and *geopandas* is used to read the data. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The three modules needed to work with sample data are *libpysal*, *pandas* and *geopandas*. \n",
+ "\n",
+ "Some additional imports are included to avoid excessive warning messages. With later versions of PySAL, these may not be needed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "e398e42f",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "import libpysal"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4deb9fda",
+ "metadata": {},
+ "source": [
+ "In order to have some more flexibility when listing the contents of data frames, the `display.max_rows` option is set to 100 (this step can easily be skipped, but then the listing of example data sets below will be incomplete)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dda117c5",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "pd.options.display.max_rows = 100\n",
+ "pd.options.display.max_rows"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functionality Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " \n",
+ "- from libpysal:\n",
+ " - examples.available\n",
+ " - examples.explain\n",
+ " - examples.load_example\n",
+ " - examples.get_path"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9b0c168",
+ "metadata": {},
+ "source": [
+ "### Input Files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "74ab0075",
+ "metadata": {},
+ "source": [
+ "All notebooks used for this course are organized such that the relevant filenames and variables names are listed at the top, so that they can be easily adjusted for use with your own data sets and variables. In this notebook, the use of PySAL sample data sets is illustrated. For other data sets, the general approach is the same, except that either the files must be present in the current working directory, or the full pathname must be specified. In later notebooks, only sample data sets will be used.\n",
+ "\n",
+ "Here, the **Chi-SDOH** sample shape file is illustrated. The specific file names are:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: a shape file (four files!) with socio-economic determinants of health for 2014 in 791 Chicago tracts\n",
+ "\n",
+ "In the other *spreg* notebooks, it is assumed that will you have installed the relevant example data sets using functionality from the *libpysal.examples* module. This is illustrated in detail here, but will not be repeated in the other notebooks. If the files are not loaded using the `libpysal.examples` functionality, they can be downloaded as individual files from https://github.com/lanselin/spreg_sample_data/ or https://geodacenter.github.io/data-and-lab/. You must then pass the full path to **infileshp** used as arguments in the corresponding `geopandas.read_file` command.\n",
+ "\n",
+ "The input file is specified generically as **infileshp** (for the shape file). "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4d4335bb",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "infileshp = \"Chi-SDOH.shp\" # input shape file with data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "45549ced",
+ "metadata": {},
+ "source": [
+ "## Accessing a PySAL Remote Sample Data Set "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd0db16e-fe3c-45c6-85d9-178d42d016c5",
+ "metadata": {},
+ "source": [
+ "### Installing a remote sample data set"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69b1e985",
+ "metadata": {},
+ "source": [
+ "All the needed files associated with a remote data set must be installed locally. The list of available remote data sets is shown by means of `libpysal.examples.available()`. When the file is also installed, the matching item in the **Installed** column will be given as **True**. \n",
+ "\n",
+ "If the sample data set has not yet been installed, **Installed** is initially set to **False**. For example, if the **chicagoSDOH** data set is not installed, item **79** in the list (**chicagoSDOH**), is given as **False**. Once the example data set is loaded, this will be changed to **True**.\n",
+ "\n",
+ "The example data set only needs to be loaded once. After that, it will be available for all future use in *PySAL* (not just in the current notebook), using the standard `get_path` functionality of `libpysal.examples`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "474197b2",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "libpysal.examples.available()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bee29a1d",
+ "metadata": {},
+ "source": [
+ "The contents of any `PySAL` example data set can be shown by means of `libpysal.examples.explain`. Note that this does **not** load the data set, but it accesses the contents remotely (you will need an internet connection). As listed, the data set is for 791 census tracts in Chicago and it contains 65 variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "02d1d8a4",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "libpysal.examples.explain(\"chicagoSDOH\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f2748a31",
+ "metadata": {},
+ "source": [
+ "The example data set is installed locally by means of `libpysal.examples.load_example` and passing the name of the remote example. Note the specific path to which the data sets are downloaded, you will need that if you ever want to remove the data set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5f24ea0",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "libpysal.examples.load_example(\"chicagoSDOH\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9c737fc3",
+ "metadata": {},
+ "source": [
+ "At this point, when checking `available`, the data set is listed as **True** under **Installed**. As mentioned, the installation only needs to be carried out once."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a772500e",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "libpysal.examples.available()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35566ba3",
+ "metadata": {},
+ "source": [
+ "### Reading Input Files from the Example Data Set"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73afed82",
+ "metadata": {},
+ "source": [
+ "The actual path to the files contained in the local copy of the remote data set is found by means of `libpysal.examples.get_path`. This is then passed to the *geopandas* `read_file` function in the usual way. Here, this is a bit cumbersome, but the command can be simplified by specific statements in the module import, such as `from libpysal.examples import get_path`. The latter approach will be used in later notebooks, but here the full command is used. \n",
+ "\n",
+ "For example, the path to the input shape file is (this may be differ somewhat depending on how and where PySAL is installed):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc1d2a01",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "libpysal.examples.get_path(infileshp)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d11c88fd",
+ "metadata": {},
+ "source": [
+ "As mentioned earlier, if the example data are not installed locally by means of `libpysal.examples`, the `get_path` command must be replaced by an explicit reference to the correct file path name. This is easiest if the files are in the current working directory, in which case just specifying the file names in **infileshp** etc. is sufficient.\n",
+ "\n",
+ "The shape file is read by means of the *geopandas* `read_file` command, to which the full file pathname is passed obtained from `libpysal.examples.get_path(infileshp)`. To check if all is right, the shape of the data set (number of observations, number of variables) is printed (using the standard `print( )` command), as well as the list of variable names (columns in *pandas* speak). Details on dealing with *pandas* and *geopandas* data frames are covered in a later notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8d93a92",
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "inpath = libpysal.examples.get_path(infileshp)\n",
+ "dfs = gpd.read_file(inpath)\n",
+ "print(dfs.shape)\n",
+ "print(dfs.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "604e8ab2",
+ "metadata": {},
+ "source": [
+ "### Removing an Installed Remote Sample Data Set"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6974e1b6",
+ "metadata": {},
+ "source": [
+ "In case that for some reason the installed remote **chicagoSDOH** data set is no longer needed, it can be removed by means of standard linux commands (or equivalent, for other operating systems). For example, on a Mac or Linux-based system, one first moves to the directory where the files were copied to. This is the same path that was shown when `load_example` was executed. In the example for a Mac OS operating system, this was shown in **Downloading chicagoSDOH to /Users/luc/Library/Application Support/pysal/chicagoSDOH**.\n",
+ "\n",
+ "So, in a terminal window, one first moves to /Users/your_user_name/Library/'Application Support'/pysal (don't forget the quotes) on a Mac system (and equivalent for other operating systems). There, the **chicagoSDOH** directory will be present. It is removed by means of:\n",
+ " \n",
+ "`rm -r chicagoSDOH`\n",
+ " \n",
+ "Of course, once removed, it will have to be reinstalled if needed in the future."
+ ]
+ },
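+ {
+ "cell_type": "markdown",
+ "id": "4e5f6071",
+ "metadata": {},
+ "source": [
+ "Equivalently, the removal can be carried out from within Python, as in the minimal sketch below. The path is the macOS example from above and should be adjusted to the location reported by `load_example` on your system. The statements are shown commented out so that the data set is not removed by accident when running the notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f607182",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative sketch only - adjust the path for your operating system\n",
+ "# import shutil, os\n",
+ "# pysal_dir = os.path.expanduser(\"~/Library/Application Support/pysal\")   # macOS example\n",
+ "# shutil.rmtree(os.path.join(pysal_dir,\"chicagoSDOH\"), ignore_errors=True)"
+ ]
+ },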
+ {
+ "cell_type": "markdown",
+ "id": "94d9a818-6ce8-4658-b51a-54ad7178c795",
+ "metadata": {},
+ "source": [
+ "## Practice"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b29d53d4-69c7-411f-887d-efb6bc6b426f",
+ "metadata": {},
+ "source": [
+ "If you want to use other PySAL data sets to practice the spatial regression functionality in *spreg*, make sure to install them using the instructions given in this notebook. For example, load the **Police** data set (item 52 in the list), which will be used as an example in later notebooks."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/2_data_input_output.ipynb.txt b/_sources/notebooks/2_data_input_output.ipynb.txt
new file mode 100644
index 00000000..20ad78ff
--- /dev/null
+++ b/_sources/notebooks/2_data_input_output.ipynb.txt
@@ -0,0 +1,795 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6c2d40bf",
+ "metadata": {},
+ "source": [
+ "# Data Input/Output\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### 09/06/2024"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f411eb3f",
+ "metadata": {},
+ "source": [
+ "## Preliminaries"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c764aed",
+ "metadata": {},
+ "source": [
+ "In this notebook, some elementary functionality is covered to carry out data input and output from and to different types of files. The key concept is a so-called *DataFrame*, a tabular representation of the data with observations as rows and variables as columns.\n",
+ "\n",
+ "This is implemented by means of *pandas* for generic text files (as well as many other formats) and *geopandas* for spatial data files (shape files or geojson files). The functionality will be illustrated with the **Police** sample data set that contains police expenditure data for Mississippi counties. It is assumed that this data has been installed using `libpysal.examples.load_example(\"Police\")`.\n",
+ "\n",
+ "A video recording is available from the GeoDa Center YouTube channel playlist *Applied Spatial Regression - Notebooks*, at https://www.youtube.com/watch?v=7yWOgPEBQmE&list=PLzREt6r1NenmhNy-FCUwiXL17Vyty5VL6&index=2."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "936a0938",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The work horse for spatial analysis in Python is the *PySAL* library. However, before addressing specific spatial functionality, the use of *pandas* and *geopandas* will be illustrated to load data into so-called data frames. In addition, *libpysal* is needed to access the sample data sets. All of these rely on *numpy* as a dependency.\n",
+ "\n",
+ "The full set of imports is shown below. Also, in this notebook, the `get_path` functionality of `libpysal.examples` is imported separately, without the rest of *libpysal*."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "db02c49b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "import geopandas as gpd\n",
+ "from libpysal.examples import get_path"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "203e4930",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from numpy:\n",
+ " - array\n",
+ " - shape\n",
+ " - tolist\n",
+ " - reshape\n",
+ "\n",
+ "- from pandas:\n",
+ " - read_csv\n",
+ " - head\n",
+ " - info\n",
+ " - list\n",
+ " - columns\n",
+ " - describe\n",
+ " - corr\n",
+ " - DataFrame\n",
+ " - concat\n",
+ " - to_csv\n",
+ " - drop\n",
+ " \n",
+ "- from geopandas:\n",
+ " - read_file\n",
+ " - to_file\n",
+ "\n",
+ "- from libpysal:\n",
+ " - get_path"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c3416ff",
+ "metadata": {},
+ "source": [
+ "### Files\n",
+ "\n",
+ "Data input and output will be illustrated with the **Police** sample data set. This data set contains the same information in several different formats, such as csv, dbf, shp and geojson, which will be illustrated in turn. The following files will be used:\n",
+ "\n",
+ "- **police.shp,shx,dbf,prj**: shape file (four files) for 82 counties\n",
+ "- **police.csv**: the same data in csv text format\n",
+ "- **police.geojson**: the spatial layer in geojson format\n",
+ "\n",
+ "All the files are defined here, and referred to generically afterwards, so that it will be easy to re-run the commands for a separate application. The only changes needed would be the file names and/or variable names (if needed)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "d82e2963",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infilecsv = \"police.csv\" # input csv file\n",
+ "outfilecsv = \"test1.csv\" # output csv file\n",
+ "infiledbf = \"police.dbf\" # input dbf file\n",
+ "outfiledbf = \"test2.csv\" # output dbf file\n",
+ "infileshp = \"police.shp\" # input shape file\n",
+ "outfileshp = \"test3.shp\" # output shape file\n",
+ "infilegeo = \"police.geojson\" # input geojson file\n",
+ "outfilegeo = \"test4.geojson\" # output geojson file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cbbc97ea",
+ "metadata": {},
+ "source": [
+ "## Text Files\n",
+ "\n",
+ "### Input\n",
+ "\n",
+ "The input file for csv formatted data is **infilecsv**. In the example, this is the csv file **police.csv**. The path to the installed sample data set is found with `get_path` (note the form of the `import` statement, which means that the full prefix `libpysal.examples` is not needed).\n",
+ "\n",
+ "The pandas command `read_csv` creates a data frame, essentially a data table. One of its attributes is `shape`, the dimension of the table as number of rows (observations) and number of columns (variables). `df.head( )` lists the first few rows of the actual table."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d6dd6bae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infilecsv)\n",
+ "df = pd.read_csv(inpath)\n",
+ "print(df.shape)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e56ab718",
+ "metadata": {},
+ "source": [
+ "### Contents"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e89780e4",
+ "metadata": {},
+ "source": [
+ "A technical way to see the contents of a *pandas* data frame is to use the `info` command. This gives the class, range of the index (used internally to refer to rows) and the data type of the variables (columns)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e12746d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f9d50a2",
+ "metadata": {},
+ "source": [
+ "An arguably more intuitive sense of the contents of the data frame is to just list the names of all the variables. This can be accomplished several different ways, illustrating the flexibility of *pandas*. However, it is important to know what type of object the result of each operation yields. Depending on the approach, this could be a list, a pandas index object or a numpy array. Assuming the wrong type for the result can cause trouble.\n",
+ "\n",
+ "The following four approaches will each extract the column headers, but yield the result as a different type of object. This will determine how it can be further manipulated:\n",
+ "\n",
+ "- `list(df)`: creates a simple list with the variable names\n",
+ "\n",
+ "- `df.columns`: yields the columns as a pandas index object\n",
+ "\n",
+ "- `df.columns.values`: yields the columns as a numpy array\n",
+ "\n",
+ "- `df.columns.values.tolist( )`: yields the columns as a list, same as `list(df)`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c4ece4ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "varlist1 = list(df)\n",
+ "print(varlist1)\n",
+ "type(varlist1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dce1b796",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "varlist2 = df.columns\n",
+ "print(varlist2)\n",
+ "type(varlist2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33196a5d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "varlist3 = df.columns.values\n",
+ "print(varlist3)\n",
+ "type(varlist3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e08f9aa3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "varlist4 = df.columns.values.tolist()\n",
+ "print(varlist4)\n",
+ "type(varlist4)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31293509",
+ "metadata": {},
+ "source": [
+ "### Descriptive Statistics\n",
+ "\n",
+ "A quick summary of the data set is provided by the `describe` command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba2bfbb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5936652f",
+ "metadata": {},
+ "source": [
+ "### Extracting Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d167db2c",
+ "metadata": {},
+ "source": [
+ "Variables (columns) can easily be extracted from a dataframe by listing their names in a list and subsetting the data frame (there are other ways as well, but they will not be considered here). It is important to keep in mind that the result is a different view of the same data frame, which may not be what is expected. In fact, in many applications in the context of *spreg*, the result should be a numpy array. This requires an extra step to cast the data frame to an array object.\n",
+ "\n",
+ "Also, in many contexts, an additional variable may need to be added to the data frame. For example, this will be needed for regression residuals and predicted values in a later notebook. To illustrate some of the steps involved, the variable **COLLEGE** will be turned into its complement (i.e., percent population without a college degree) and subsequently added to the data frame. To illustrate some descriptive statistics, **POLICE** will be extracted as well.\n",
+ "\n",
+ "First, the variable names are put in a list to subset the data frame and check the type. Make sure to use double brackets, the argument to the subset [ ] is a list, so [[list of variable names in quotes, separated by commas]]. The result is a *pandas* data frame or series (one variable).\n",
+ "\n",
+ "Note: if you want to do this for your own data set, possibly using different variables and different expressions, you will need to adjust the code below accordingly. Typically, this is avoided in these notebooks, but here there is no option to make things totally generic."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ef84f45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1 = df[['POLICE','COLLEGE']]\n",
+ "type(df1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "964daaa4",
+ "metadata": {},
+ "source": [
+ "A more elegant approach and one that will make it much easier to reuse the code for different data sets and variables is to enter the variable names in a list first, and then pass that to subset the data frame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ab77cecf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "varnames = ['POLICE','COLLEGE']\n",
+ "df2 = df[varnames]\n",
+ "type(df2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db80a141",
+ "metadata": {},
+ "source": [
+ "At this point, it is much more meaningful to get the descriptive statistics using `describe`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6cddcd1a",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "df2.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f67f53ea",
+ "metadata": {},
+ "source": [
+ "A correlation coefficient is obtained by means of the `corr` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "83f15c59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.corr()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e99466c",
+ "metadata": {},
+ "source": [
+ "### Extracting Variables to a Numpy Array"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10df9613",
+ "metadata": {},
+ "source": [
+ "As mentioned, when using variables in the context of **spreg** routines, they will often need to be numpy arrays, not a data frame. This is accomplished by means of the `numpy.array` function (`np.array` in the notation used here). The `shape` attribute is a check to make sure that the resulting matrices have the correct format. In the example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1eb1d639",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x1 = np.array(df[varnames])\n",
+ "print(x1.shape)\n",
+ "type(x1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cdab7c0e",
+ "metadata": {},
+ "source": [
+ "### Computations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0255cfee",
+ "metadata": {},
+ "source": [
+ "New variables (columns) can be added to an existing data frame by means of straightforward element by element computations. However, to do this within the data frame structure is a bit cumbersome, since the data frame name needs to be included for each variable. On the other hand, the result is immediately attached to the data frame. \n",
+ "\n",
+ "Alternatively, the computations can be carried out using the numpy array and subsequently attached to the data frame. However, for a one-dimensional result, the shape of the result is a one-dimensional numpy array, not a row or a column vector. To obtain the latter, the `reshape` command needs to be used.\n",
+ "\n",
+ "For example, to compute the complement of the percentage with a college degree (in column 1 of array **x1**), the second column of the array is subtracted from 100. The element-by-element computation gives the desired result, but not the correct shape."
+ ]
+ },
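+ {
+ "cell_type": "markdown",
+ "id": "b3a1c5e7",
+ "metadata": {},
+ "source": [
+ "As a brief aside, a minimal sketch of the in-data-frame computation mentioned above is shown first. It operates on a copy of the data frame, so the remainder of the notebook is unaffected, and the column name **NONCOLLEGE_ALT** is used for this illustration only. The cells that follow then implement the numpy route just described."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7d2e9f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# in-data-frame computation: the data frame name is repeated for each variable\n",
+ "dfc = df.copy() # work on a copy so the original data frame is left untouched\n",
+ "dfc['NONCOLLEGE_ALT'] = 100.0 - dfc['COLLEGE']\n",
+ "dfc[['COLLEGE','NONCOLLEGE_ALT']].head()"
+ ]
+ },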
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ee6a4eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "noncollege = 100.0 - x1[:,1]\n",
+ "noncollege"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87b3dc37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "noncollege.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "041ced4c",
+ "metadata": {},
+ "source": [
+ "The correct dimension is obtained by means of `reshape(-1,1)`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2beb3219",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "noncollege = noncollege.reshape(-1,1)\n",
+ "print(noncollege.shape)\n",
+ "noncollege[0:5,:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4af49803",
+ "metadata": {},
+ "source": [
+ "Note the extra brackets in the (82,1) column vector compared to the (82, ) numpy array above."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16331996",
+ "metadata": {},
+ "source": [
+ "### Concatenating Data Frames"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "47a9ceb6",
+ "metadata": {},
+ "source": [
+ "In order to add the result of the matrix calculation to the data frame, two steps are involved. First, the numpy array is turned into into a data frame using `pandas.DataFrame`, making sure to give meaningful names to the columns by means of the `columns` argument. Then the `pandas.concat` function is applied to join the two data frames together. One can of course combine the two operations into one line, but here they are kept separate for clarity. **NONCOLLEGE** is added as the last variable in the data frame.\n",
+ "\n",
+ "Note that `axis=1` is set as an argument to the `concat` function to make sure a column is added (`axis=0` is to add a row)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7b415460",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dd = pd.DataFrame(noncollege,columns=['NONCOLLEGE'])\n",
+ "df = pd.concat([df,dd],axis=1)\n",
+ "print(df.columns)"
+ ]
+ },
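+ {
+ "cell_type": "markdown",
+ "id": "d4e5f6a7",
+ "metadata": {},
+ "source": [
+ "For reference, a minimal sketch of the combined one-line version follows. To keep **df** unchanged (it already contains **NONCOLLEGE** at this point), the result is assigned to a separate data frame **df_alt** (a name used only for this illustration), starting from the smaller subset **df2**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5f6a7b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# the DataFrame creation and the concat combined in a single expression\n",
+ "df_alt = pd.concat([df2, pd.DataFrame(noncollege, columns=['NONCOLLEGE'])], axis=1)\n",
+ "print(df_alt.columns)"
+ ]
+ },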
+ {
+ "cell_type": "markdown",
+ "id": "48bae79a",
+ "metadata": {},
+ "source": [
+ "### Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8183fbf0",
+ "metadata": {},
+ "source": [
+ "If desired, the new data frame can be written to a csv file using the `to_csv` command. The only required argument is the filename. For example, with the generic file name **outfilecsv** as defined at the top of the notebook, the file will be written to the current working directory. Its contents can be examined with any text editor or by loading it into a spreadsheet program. \n",
+ "\n",
+ "To avoid the index numbers as a first unnamed column (i.e., the default row names), an extra argument is `index = False`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "a39e0475",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.to_csv(outfilecsv,index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d5bf3069",
+ "metadata": {},
+ "source": [
+ "### DBase Files (dbf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65673215",
+ "metadata": {},
+ "source": [
+ "A common (but old) format for tabular data bases is the dBase format, with file extension dbf. Even though it is old (and, arguably, out of date), this format is still quite useful because it is used to store the data (attributes) in one of the common spatial data formats, the shape file popularized by ESRI (see below).\n",
+ "\n",
+ "As it happens, *pandas* is currently not able to read data from a dbf file directly into a data frame. Specialized packages exist that implement this functionality (like *simpledbf*). However, as it happens, *geopandas*, considered in more detail below, also reads dbf files by means of its `read_file` command. No special arguments are needed, since the file format is derived from the file extension.\n",
+ "\n",
+ "For example, to read the data from **police.dbf** (the same as in **police.csv**), the path to the sample data file **infiledbf** is found with `get_path` and passed to the `geopandas.read_file` command. The result is a **GeoDataFrame**, not a regular **DataFrame**. This is an artifact of the dbf file being in the same directory as the shape file. The same command applied to the dbf file in isolation will be a **DataFrame**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e8ca9603",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infiledbf)\n",
+ "dfdb = gpd.read_file(inpath)\n",
+ "print(dfdb.shape)\n",
+ "print(type(dfdb))\n",
+ "print(dfdb.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56c081ac",
+ "metadata": {},
+ "source": [
+ "A close look at the dimensions and the columns reveals an additional column (22 compared to 21) with column name `geometry`. This can be removed by means of the `drop(columns = \"geometry\")` command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70540bdd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfdb = dfdb.drop(columns = 'geometry')\n",
+ "print(dfdb.shape)\n",
+ "print(type(dfdb))\n",
+ "print(dfdb.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53dc84ac",
+ "metadata": {},
+ "source": [
+ "Now, the dimension is tha same as for the csv file and the `geometry` column has disappeared. Also, the `type` of the result is a regular **DataFrame**.\n",
+ "\n",
+ "As mentioned, if the dbf file is in a directory without the presence of a spatial layer, the `geometry` column will not be present. In that case, the result is a regular **DataFrame**, NOT a **GeoDataFrame**.\n",
+ "\n",
+ "It is important to keep this in mind, since *pandas* has currently no support for writing dbf files, whereas *geopandas* only has support for writing dbf files that contain a `geometry` column. However, a *pandas* data frame can be written to a csv file as seen before, using `to_csv`. The input dbf file can thus be converted to a csv file, but any changes cannot be saved to another dbf file.\n",
+ "\n",
+ "In general, working with dbf files in isolation is to be discouraged."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "4f1e648e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfdb.to_csv(outfiledbf,index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5e63b68",
+ "metadata": {},
+ "source": [
+ "## Spatial Data Files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ae3e939",
+ "metadata": {},
+ "source": [
+ "### Spatial Data\n",
+ "\n",
+ "Spatial data are characterized by the combination of locational information (the precise\n",
+ "definition of points, lines or areas) and so-called attribute information (variables). \n",
+ "\n",
+ "There are many formats to store spatial information, in files as well as in relational databases. To keep\n",
+ "things simple, first the so-called *shape file* format is considered, a standard supported by ESRI, one of the \n",
+ "major commercial GIS vendors. In addition, *geojson* will be covered as well, since it is an increasingly common open source format."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e70392d",
+ "metadata": {},
+ "source": [
+ "### Reading a shape file\n",
+ "\n",
+ "The terminology is a bit confusing, since there is no such thing as *one* shape file, but there is instead\n",
+ "a collection of three (or four) files. One file has the extension **.shp**, one **.shx**, one **.dbf**, and\n",
+ "one **.prj** (with the projection information). The first three are required, the fourth one is optional,\n",
+ "but highly recommended. The files should all be in the same directory and have the same main file name.\n",
+ "\n",
+ "In Python, the easiest way to read shape files is to use *geopandas*. The command is `read_file`, followed by the file pathname in parentheses. The program is smart enough to figure out the file format from the file extension *.shp*. As we saw before for the dbf format, the result is a geopandas data frame, a so-called **GeoDataFrame**, say **dfs**, which is a *pandas* **DataFrame** with an additional column for the geometry.\n",
+ "\n",
+ "All the standard pandas commands also apply to a geopandas data frame.\n",
+ "\n",
+ "The example uses the **police.shp** sample file as the input file, as specified in `infileshp` at the top of the notebook. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e85db1bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infileshp)\n",
+ "dfs = gpd.read_file(inpath)\n",
+ "print(dfs.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed98200b",
+ "metadata": {},
+ "source": [
+ "Note how the data frame has one more column than the one created from the csv file. This is the same as in the dbf example above. The last column is **geometry**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20d61cd6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dfs.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "237a031d",
+ "metadata": {},
+ "source": [
+ "### Creating New Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81350492",
+ "metadata": {},
+ "source": [
+ "Just as for a standard pandas data frame, variables can be transformed, new variables created and data frames merged. The commands are the same as before and will not be repeated here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1d8cdd2",
+ "metadata": {},
+ "source": [
+ "### Reading a GeoJSON File"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "28bc5379",
+ "metadata": {},
+ "source": [
+ "Reading any of the supported spatial formats is implemented by the same `read_file` command. As mentioned, *geopandas* figures out the right format from the file extension. The result is identical to the one for the shape file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fbf89311",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infilegeo)\n",
+ "dfg = gpd.read_file(inpath)\n",
+ "print(dfg.shape)\n",
+ "print(type(dfg))\n",
+ "print(dfg.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0598e91",
+ "metadata": {},
+ "source": [
+ "### Writing a GeoDataFrame"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c7a7c818",
+ "metadata": {},
+ "source": [
+ "The output is accomplished by the `to_file` command. This supports many different output formats, but the default is the ESRI shape file, so we do not have to specify any arguments other than the filename. Here, we use the output file name specified in `outfileshp`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "ff7d08e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.to_file(outfileshp)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15ff186d",
+ "metadata": {},
+ "source": [
+ "Writing a geojson file works in exactly the same way, for example, using the output file specified in **outputgeo**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "fef50cad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfg.to_file(outfilegeo)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ad019ff",
+ "metadata": {},
+ "source": [
+ "## Practice"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "72ca1664",
+ "metadata": {},
+ "source": [
+ "Use your own data set or one of the PySAL sample data sets to load a spatial data frame, create some new variables, optionally get descriptive statistics and write out an updated data set. This type of operation will be used frequently in the course of the regression analysis, for example, to add predicted values and/or residuals to a spatial layer."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/3_basic_mapping.ipynb.txt b/_sources/notebooks/3_basic_mapping.ipynb.txt
new file mode 100644
index 00000000..4361d733
--- /dev/null
+++ b/_sources/notebooks/3_basic_mapping.ipynb.txt
@@ -0,0 +1,674 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "6c2d40bf",
+ "metadata": {},
+ "source": [
+ "# Basic Mapping\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### 09/06/2024"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f411eb3f",
+ "metadata": {},
+ "source": [
+ "## Preliminaries"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c764aed",
+ "metadata": {},
+ "source": [
+ "There are many ways to create beautiful maps in Python using packages such as *folium* or *plotly*. In this notebook, the `plot` functionality of *geopandas* is illustrated, which is sufficient for most of our purposes. The functionality will be illustrated with the **Police** sample data set that contains police expenditure data for Mississippi counties. It is assumed that this data has been installed using `libpysal.examples.load_example(\"Police\")`.\n",
+ "\n",
+ "A video recording is available from the GeoDa Center YouTube channel playlist *Applied Spatial Regression - Notebooks*, at https://www.youtube.com/watch?v=rZ1Mw-hZcMY&list=PLzREt6r1NenmhNy-FCUwiXL17Vyty5VL6&index=3."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "936a0938",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "As before, the main modules are *geopandas* and *libpysal*. Specifically, *libpysal.examples* is used to get the path to the sample data. In addition, to save the maps to a file, *matplotlib.pyplot* is needed.\n",
+ "\n",
+ "The full set of imports is shown below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "db02c49b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "import geopandas as gpd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from libpysal.examples import get_path"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "203e4930",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from geopandas:\n",
+ " - read_file\n",
+ " - plot\n",
+ "\n",
+ "- from libpysal:\n",
+ " - get_path\n",
+ "\n",
+ "- from matplotlib.pyplot:\n",
+ " - savefig"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c3416ff",
+ "metadata": {},
+ "source": [
+ "### Files\n",
+ "\n",
+ "The mapping functionality will be illustrated with the same **Police** sample data set as used in the previous notebook. The following files will be used:\n",
+ "\n",
+ "- **police.shp,shx,dbf,prj**: shape file (four files) for 82 counties\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "d82e2963",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = \"police.shp\" # input shape file\n",
+ "inpath = get_path(infileshp)\n",
+ "dfs = gpd.read_file(inpath)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ac636ba",
+ "metadata": {},
+ "source": [
+ "## Getting Started"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c30a057",
+ "metadata": {},
+ "source": [
+ "### Default Map"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52cf67ae",
+ "metadata": {},
+ "source": [
+ "Before delving into customization, the default choropleth map created by the `plot` function applied to a **GeoDataFrame** is illustrated. A bare bones implementation only requires the variable (column) to be mapped and the argument `legend = True`. Without the latter, there will still be a map, but it will not have a legend, so there will be no guide as to what the colors mean."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2f412ce",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.plot('POLICE',legend=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01ce3b94",
+ "metadata": {},
+ "source": [
+ "Not exactly the prettiest thing in the world. A continuous color ramp as seen here is not recommended by cartographers. Also, the classification is such that too many observations have seemingly the same color. Finally, there is also this strange mention of ****.\n",
+ "\n",
+ "There are two important types of modifications that can be considered. One pertains to the fundamental characteristics of a choropleth map, the other to the way *matplotlib* constructs visualizations under the hood. The *geopandas* library relies on *matplotlib* so there is no need to `import` the latter explicitly, except when one wants to save the maps to a file. In any case, it helps to understand the *matplotlib* logic. This is considered first."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c12c0184",
+ "metadata": {},
+ "source": [
+ "### Matplotlib Logic"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78e5979e",
+ "metadata": {},
+ "source": [
+ "The *matplotlib* library is extremely powerful and allows just about any type of customized visualization. It starts by setting up the basic parameters and then builds a graphic representation layer by layer. The terminology may seem a bit strange at first, but after a while, it becomes more familiar.\n",
+ "\n",
+ "A plot is initialized by assigning some parameters to the tuple `fig , ax`. It is important to realize that `fig` is about the figure makeup and `ax` is about the actual plots. For example, `fig` is used to specify how many subplots there need to be, how they are arranged and what their size is. Since the examples used here and in later notebooks will only produce a single plot, the `fig` aspect can be ignored and only `ax` is needed. In fact, for simple plots such as the maps in our applications, the specification of `ax` as such is not needed and the `plot` function can be applied directly to the GeoDataFrame. However, it remains important that the plot object is referred to as `ax` in many operations."
+ ]
+ },
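+ {
+ "cell_type": "markdown",
+ "id": "a9b8c7d6",
+ "metadata": {},
+ "source": [
+ "To make the `fig , ax` logic a bit more concrete, the cell below is a minimal sketch of the fully explicit setup (the figure size is arbitrary). It is not required for the simple maps in these notebooks, where `plot` can be applied directly to the GeoDataFrame, but it shows where the two objects come from and how an existing `ax` is passed to `plot` by means of the `ax` argument."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8c7d6e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# explicit matplotlib setup: fig is the overall figure, ax the (single) plot\n",
+ "fig, ax = plt.subplots(1, 1, figsize=(6, 6))\n",
+ "dfs.plot('POLICE', legend=True, ax=ax) # draw the map into the existing ax\n",
+ "ax.set_axis_off()"
+ ]
+ },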
+ {
+ "cell_type": "markdown",
+ "id": "579ab0f0",
+ "metadata": {},
+ "source": [
+ "An alternative way to set up the default map just shown is to explicitly assign it to an object `ax`, as `ax = dfs.plot( )` with the same arguments as before. To remove the x-y coordinates and box around the map, the method `set_axis_off()` is applied to the `ax` object. Using this setup also removes the **** listing. Otherwise, everything is still the same as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d50f07df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot('POLICE',legend = True)\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0d881ef0",
+ "metadata": {},
+ "source": [
+ "Note that the same result can be obtained without the explicit assignment to `ax` by simply applying the method to the `plot` object, as in the example below. Typically, the more explicit assignment is considered to be more readable, but it is mostly a matter of preference."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "09d54b24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.plot('POLICE',legend=True).set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "392346c2",
+ "metadata": {},
+ "source": [
+ "## Map Design Characteristics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a505e9bf",
+ "metadata": {},
+ "source": [
+ "The purpose of a choropleth or thematic map is to visualize the spatial distribution of a variable over areal units. Choropleth comes from the Greek choros, which stands for region, so it is a map for regions. For our purposes, the proper design of a map has three important characteristics, which each translate into arguments to the `plot` function:\n",
+ "\n",
+ "- classification\n",
+ "\n",
+ "- color\n",
+ "\n",
+ "- legend"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c3619b7",
+ "metadata": {},
+ "source": [
+ "### Classification"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a3db6d84",
+ "metadata": {},
+ "source": [
+ "Arguably the most important characteristic is the classification used, i.e., how the continuous distribution of a given variable gets translated into a small number of discrete categories, or bins. This is exactly the same issue encountered in the design of histogram bins.\n",
+ "\n",
+ "The assignment of observations to distinct bins is done by the *mapclassify* library, which is part of the *PySAL* family. However, this is done under the hood by *geopandas* so that no separate `import` statement is needed.\n",
+ "\n",
+ "The classification is set by means of the `scheme` argument. Common options are `Quantiles` (for a quantile map), `EqualInterval` (for an equal intervals map), `NaturalBreaks` (for a natural breaks map), `StdMean` (for a standard deviational map), and `BoxPlot` (for a box map). All but the last two classifications require an additional argument for the number of bins, `k`. This is not needed for the standard deviational map and the box map, for which the breakpoints are derived from the data, respectively the standard deviation and the quartiles/hinge.\n",
+ "\n",
+ "The default hinge for the box map is 1.5 times the interquartile range. Other values for the hinge can be specified by setting a different value for the argument `hinge`, but this is typically not necessary. However, to pass this to the *geopandas* `plot` function it cannot just be set as `hinge = 3.0` as in *mapclassify*. In *geopandas* it is necessary to pass this in a `classification_kwds` dictionary, where the relevant parameters are set. For example, this would be `classification_kwds = {\"hinge\": 3.0}` for a hinge of 3 times the interquartile range.\n",
+ "\n",
+ "The default for the standard deviational map is to show all observations within one standard deviation below and above the mean as one category. To separate observations below and above the mean can be accomplished by setting the argument `anchor` to `True`. Again, this is done by means of the `classification_kwds` dictionary.\n",
+ "\n",
+ "Full details on all the classifications available through *mapclassify* and their use in *geopandas* can be found at https://geopandas.org/en/stable/docs/user_guide/mapping.html# and https://pysal.org/mapclassify/api.html.\n",
+ "\n",
+ "Each of the five cases is illustrated in turn. Note that the `column` argument is used to designate the variable to be mapped.\n",
+ "\n",
+ "The placement of the legend is managed by means of the `legend_kwds` argument (similar to `classification_kwds`). This is a dictionary that specifies aspects such as the location of the legend and how it is positioned relative to its anchor point. It also makes it possible to set a `title` for the legend, e.g., to set it to the variable that is being mapped.\n",
+ "\n",
+ "In the examples, the following arguments are used: `legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": \"\"}`. This is not totally intuitive, but it works. See https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.legend.html#matplotlib.axes.Axes.legend for details about the various legend customizations.\n",
+ "\n",
+ "Also note that the map uses the default color map. More appropriate color maps will be considered below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bcae4d0a",
+ "metadata": {},
+ "source": [
+ "#### Quantile Map\n",
+ "\n",
+ "A simple six category quantile map is illustrated by setting `scheme = \"Quantiles\"` and `k=6`. The `legend` arguments now also include a `title`. In addition, two `ax` methods are used for a minor customization: `ax.set_title` to give the map a title and, as before, `ax.set_axis_off` to get rid of the box with x-y coordinates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a7724f4c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'Quantiles',\n",
+ " k = 6,\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\" : \"Police\"}\n",
+ ")\n",
+ "ax.set_title(\"Quantiles\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "734148b7",
+ "metadata": {},
+ "source": [
+ "#### Maps with Set Number of Bins"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a96e9a4b",
+ "metadata": {},
+ "source": [
+ "Rather than repeating the single command for each type of map that needs the argument `k`, a small loop is constructed that creates each in turn. This is accomplished by putting the name for the respective `scheme` in a list and using that same name as the map title. The three types are `Quantiles`, `EqualInterval` and `NaturalBreaks`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca52a103",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schemek = [\"Quantiles\",\"EqualInterval\",\"NaturalBreaks\"]\n",
+ "for i in schemek:\n",
+ " ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = i,\n",
+ " k = 6,\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": \"Police\"}\n",
+ " )\n",
+ " ax.set_title(i)\n",
+ " ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6fbc2acd",
+ "metadata": {},
+ "source": [
+ "Note the contrast in the visualization of the spatial distribution between the different classifications. It is important to keep in mind that each has pros and cons. For example, the quantile map yields an equal number of observations in each category, but the range of the categories can vary subtantially, resulting in the grouping of very disparate observations. In the example, this is the case for the top category, which ranges from 1,275 to 10,972.\n",
+ "\n",
+ "On the other hand, the range in an equal intervals map is the same for all categories, but as a result some bins may have very few or very many observations, as is the case here for the lowest bin.\n",
+ "\n",
+ "Finally, a natural breaks map uses an optimization criterion (essentially equivalent to k-means on one variable) to determine the grouping of observations. Both the number of observations in each bin and the range of the bins is variable."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "62839ace",
+ "metadata": {},
+ "source": [
+ "#### Maps with Predetermined number of Bins\n",
+ "\n",
+ "The standard deviational map and box map have a pre-set number of bins, depending on, respectively, standard deviational units and quantiles/interquantile range. Again, they are illustrated using a small loop."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b56d7d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "schemenok = [\"StdMean\",\"BoxPlot\"]\n",
+ "for i in schemenok:\n",
+ " ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = i,\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ " )\n",
+ " ax.set_title(i)\n",
+ " ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c2b0cd96",
+ "metadata": {},
+ "source": [
+ "Both types of maps are designed to highlight outliers. In the standard deviational map, these are observations more than two standard deviations away from the mean, in the box map, the outliers are outside the hinge (1.5 times the interquartile range from the median). This can be customized by setting a different value for the hinge through the `classification_kwds` argument. For example, selecting only the most extreme observations is achieved by setting `classification_kwds = {\"hinge\": 3.0}`, as illustrated below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5de45eaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'BoxPlot',\n",
+ " k = 6,\n",
+ " classification_kwds = {'hinge': 3.0},\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": \"Police\"}\n",
+ ")\n",
+ "ax.set_title(\"Box Map\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df595929",
+ "metadata": {},
+ "source": [
+ "A standard deviational map with the categories below and above the mean shown is implemented with `classification_kwds = {\"anchor\" : True}`, as shown below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e17b901",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'StdMean',\n",
+ " k = 6,\n",
+ " classification_kwds = {'anchor': True},\n",
+ " legend = True,\n",
+ " legend_kwds = {\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Standard Deviational Map\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a4e8ede",
+ "metadata": {},
+ "source": [
+ "Whereas the first three types of classifications have a color scheme that suggests a progression from low to high values, a so-called *sequential* legend, the standard deviational map and box map focus on differences from a central value. This requires a color map that highlights the move away from the center, a so-called *diverging* legend. In the examples shown so far, the categories were shown with the default sequential color map, which is not appropriate. The needed customizations are considered next."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "934e2cab",
+ "metadata": {},
+ "source": [
+ "### Color Map"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ee73f79",
+ "metadata": {},
+ "source": [
+ "The color scheme for the map is set by means of the `cmap` argument. This refers to a *matplotlib* color map, i.e., a pre-determined range of colors optimized for a particular purpose. For example, this allows for a different color map to represent a sequential vs. a diverging legend.\n",
+ "\n",
+ "The full range of color maps can be found at https://matplotlib.org/stable/users/explain/colors/colormaps.html.\n",
+ "\n",
+ "For our purposes, a good sequential color map uses a gradation that goes from light to dark, either in the same color, such as `cmap=\"Blues\"`, or moving between colors, such as `cmap=\"YlOrRd\"`. For a diverging legend, going from one extreme color to another is preferred, e.g., dark blue to light blue and then to light red and dark red, as in `cmap=\"bwr\"`, or even more extreme, as in `cmap=\"seismic\"`.\n",
+ "\n",
+ "Some examples are shown below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4eaf9c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'Quantiles',\n",
+ " k = 6,\n",
+ " cmap = 'Blues',\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Quantiles\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e523a1f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'Quantiles',\n",
+ " k = 6,\n",
+ " cmap = 'YlOrRd',\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Quantiles\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bfd213e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'BoxPlot',\n",
+ " cmap = 'seismic',\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Box Map\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc43ce36",
+ "metadata": {},
+ "source": [
+ "But notice when this is applied to the standard deviational map with `cmap = bwr`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c658bd19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'StdMean',\n",
+ " cmap = 'bwr',\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Standard Deviational Map\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "75ec5a5a",
+ "metadata": {},
+ "source": [
+ "What happened? Many of the counties are invisible. The reason is that there is no borderline specified for the map. This final customization is considered next."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb66c005",
+ "metadata": {},
+ "source": [
+ "### Final Customization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "da719cbc",
+ "metadata": {},
+ "source": [
+ "As mentioned, the full range of *matplotlib* customizations is available to manipulate legends, colors and placement. For our purposes, one more map-specific element is of interest. As seen in the previous examples, the border between polygons is not clear or even non-existent. \n",
+ "\n",
+ "This can be fixed by setting the `edgecolor` and associated `linewidth` attributes. For example, with `edgecolor = \"Black\"`, the standard deviational map becomes more meaningful."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a6392b64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'StdMean',\n",
+ " cmap = 'bwr',\n",
+ " edgecolor = \"Black\",\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Standard Deviational Map\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0fd86ac4",
+ "metadata": {},
+ "source": [
+ "#### Saving the Map to a File"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2f317018",
+ "metadata": {},
+ "source": [
+ "So far, the maps are generated in the notebook, but are not separately available. To save a specific map to a file, the `matplotlib.pyplot.savefig` command is used. For example, to save the standard deviational map (or any other map) to a png format file, only the filename needs to be specified as an argument to `plt.savefig`. Optionally, to get higher quality figures, the number of dots per inch can be set by means of `dpi`. \n",
+ "\n",
+ "This is illustrated for the standard deviational map where a more subtle border line is obtained by setting the thickness with `linewidth = 0.2`. The quality is set to `dpi = 600`.\n",
+ "\n",
+ "The file will be in the current working directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61991782",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'POLICE',\n",
+ " scheme = 'StdMean',\n",
+ " cmap = 'bwr',\n",
+ " edgecolor = \"Black\",\n",
+ " linewidth = 0.2,\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'Police'}\n",
+ ")\n",
+ "ax.set_title(\"Standard Deviational Map\")\n",
+ "ax.set_axis_off()\n",
+ "plt.savefig(\"police_stdmean.png\",dpi=600)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27a262bc",
+ "metadata": {},
+ "source": [
+ "Finally, a map with just the county borders is obtained with the `boundary.plot` command, where the color of the border line is controlled by `edgecolor` and the line thickness by `linewidth`, as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98d3c839",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.boundary.plot(\n",
+ " edgecolor = \"Black\",\n",
+ " linewidth = 0.2,\n",
+ ")\n",
+ "ax.set_title(\"Map of County Boundaries\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ad019ff",
+ "metadata": {},
+ "source": [
+ "## Practice"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "72ca1664",
+ "metadata": {},
+ "source": [
+ "Use your own data set or one of the PySAL sample data sets to load a spatial data frame and experiment with various map types, color schemes and other customizations. Save each map to a file for inclusion in papers, reports, etc."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/4_spatial_weights.ipynb.txt b/_sources/notebooks/4_spatial_weights.ipynb.txt
new file mode 100644
index 00000000..051034bc
--- /dev/null
+++ b/_sources/notebooks/4_spatial_weights.ipynb.txt
@@ -0,0 +1,1424 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Spatial Weights\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### 09/06/2024\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, basic operations pertaining to spatial weights are reviewed. Two major cases are considered: reading weights files constructed by other software, such as *GeoDa*, and creating weights from GeoDataFrames or spatial layers using the functionality in *libpysal.weights*. In addition, some special operations are covered, such as creating spatial weights for regular grids and turning a *PySAL* weights object into a full matrix. The computation of a spatially lagged variable is illustrated as well.\n",
+ "\n",
+ "A video recording is available from the GeoDa Center YouTube channel playlist *Applied Spatial Regression - Notebooks*, at https://www.youtube.com/watch?v=IbmTItot0q8&list=PLzREt6r1NenmhNy-FCUwiXL17Vyty5VL6&index=4."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main functionality is provided by the utilities in *libpysal* for spatial weights, and the functionality in *geopandas* for data input and output. All of these rely on *numpy* as a dependency.\n",
+ "\n",
+ "To simplify notation, the `libpysal.weights` module is imported as `weights`, and `get_path` and `open` are imported from respectively `libpysal.examples` and `libpysal.io`.\n",
+ "\n",
+ "The `warnings` module filters some warnings about future changes. To avoid some arguably obnoxious new features of *numpy* 2.0, it is necessary to include the `set_printoptions` command if you are using a Python 3.12 environment with numpy 2.0 or greater.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import numpy as np\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "import geopandas as gpd\n",
+ "from libpysal.examples import get_path\n",
+ "from libpysal.io import open\n",
+ "import libpysal.weights as weights\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from numpy:\n",
+ " - array\n",
+ " - mean\n",
+ " - std\n",
+ " - flatten\n",
+ " - @\n",
+ "\n",
+ "- from geopandas:\n",
+ " - read_file\n",
+ " - astype\n",
+ " \n",
+ "- from libpysal.examples:\n",
+ " - get_path\n",
+ "\n",
+ "- from libpysal.io:\n",
+ " - open\n",
+ "\n",
+ "- from libpysal.weights:\n",
+ " - neighbors\n",
+ " - weights\n",
+ " - n\n",
+ " - min_neighbors, max_neighbors, mean_neighbors\n",
+ " - pct_nonzero\n",
+ " - asymmetry, asymmetries\n",
+ " - Kernel.from_file\n",
+ " - Queen.from_dataframe\n",
+ " - transform\n",
+ " - Queen.from_file\n",
+ " - KNN.from_dataframe\n",
+ " - symmetrize\n",
+ " - Kernel\n",
+ " - Kernel.from_shapefile\n",
+ " - lat2W\n",
+ " - full\n",
+ " - lag_spatial"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67da216d",
+ "metadata": {},
+ "source": [
+ "### Files and Variables\n",
+ "\n",
+ "This notebook uses data on socio-economic correlates of health outcomes contained in the **chicagoSDOH** sample shape files and associated spatial weights. It is assumed that all sample files have been installed.\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity spatial weights from `GeoDa`\n",
+ "- **Chi-SDOH_k6s.gal**: k-nearest neighbor weights for k=6, made symmetric in `GeoDa`\n",
+ "- **Chi-SDOH_k10tri.kwt**: triangular kernel weights based on a variable bandwidth with 10 nearest neighbors from `GeoDa`\n",
+ "\n",
+ "As before, file names and variable names are specified at the top of the notebook so that this is the only part that needs to be changed for other data sets and variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "12a910c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = \"Chi-SDOH.shp\" # input shape file\n",
+ "infileq = \"Chi-SDOH_q.gal\" # queen contiguity from GeoDa\n",
+ "infileknn = \"Chi-SDOH_k6s.gal\" # symmetric k-nearest neighbor weights from GeoDa\n",
+ "infilekwt = \"Chi-SDOH_k10tri.kwt\" # triangular kernel weights for a variable knn bandwidth from GeoDa\n",
+ "outfileq = \"test_q.gal\" # output file for queen weights computed with libpysal\n",
+ "outfilek = \"test_k.kwt\" # outpuf file for kernel weights computed with libpysal\n",
+ "y_name = [\"YPLL_rate\"] # variable to compute spatial lag"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6051ceb7",
+ "metadata": {},
+ "source": [
+ "## Spatial Weights from a File (GeoDa)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51160fd3",
+ "metadata": {},
+ "source": [
+ "Spatial weights are an essential part of any spatial autocorrelation analysis and spatial regression. Functionality to create and analyze spatial weights is contained in the `libpysal.weights` library.\n",
+ "The full range of functions is much beyond the current scope and can be found at https://pysal.org/libpysal/api.html.\n",
+ "\n",
+ "Only the essentials are covered here, sufficient to proceed\n",
+ "with the spatial regression analysis. Also, only the original `Weights` class is considered. A newer alternative is provided by the `Graph` class, but it is not further discussed here. Full details can be found at https://pysal.org/libpysal/user-guide/graph/w_g_migration.html.\n",
+ "\n",
+ "Arguably the easiest way to create spatial weights is to use the *GeoDa* software (https://geodacenter.github.io/download.html), which\n",
+ "provides functionality to construct a wide range of contiguity as well as distance\n",
+ "based weights through a graphical user interface. The weights information is stored as **gal**, **gwt** or **kwt** files. Importing these weights into *PySAL* is considered first.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "85f6ef20",
+ "metadata": {},
+ "source": [
+ "### Queen Contiguity Weights\n",
+ "\n",
+ "Contiguity weights can be read into PySAL spatial weights objects using the `read` function, after opening the file with `libpysal.io.open` (here, just `open`). This is applied to the queen contiguity weights created by `GeoDa`, contained in the file **infileq**, after obtaining its path using `get_path`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce630240",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infileq)\n",
+ "wq = open(inpath).read()\n",
+ "wq"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8519f624",
+ "metadata": {},
+ "source": [
+ "The result is a PySAL spatial weights object of the class `libpysal.weights.weights.W`. This object contains lists of `neighbors` and `weights` as well as many other attributes and methods. \n",
+ "\n",
+ "It is useful to remember that the `neighbors` and `weights` are dictionaries that use an ID variable or simple sequence number as the key. A quick view of the relevant keys is obtained by converting them to a `list` and printing out the first few elements."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72590f7d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(wq.neighbors.keys())[0:5])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e69b779d",
+ "metadata": {},
+ "source": [
+ "This reveals that the keys are simple strings, starting at **'1'** and not at **0** as in the usual Python indexing. The IDs of the neighbors for a given observation can be listed by specifying the key. For example, for observation with ID='1', this yields:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca74d6fc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq.neighbors['1']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8772b742",
+ "metadata": {},
+ "source": [
+ "When an inappropriate key is used, an error is generated (recall that dictionaries have no order, so there are no sequence numbers). For example, here `1` is entered as an integer, but it should have been a string, as above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b8f34cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq.neighbors[1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d40d3b5c",
+ "metadata": {},
+ "source": [
+ "The weights associated with each observation key are found using `weights`. For example, for observation with ID='1' this yields:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1342d4e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq.weights['1']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f645036",
+ "metadata": {},
+ "source": [
+ "At this point, all the weights are simply binary. Row-standardization is considered below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f90c1984",
+ "metadata": {},
+ "source": [
+ "#### Weights Characteristics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "50ee4c2d",
+ "metadata": {},
+ "source": [
+ "A quick check on the number of observations, i.e., the number of rows in the weights matrix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a60409f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "wq.n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1751a0a5",
+ "metadata": {},
+ "source": [
+ "Minimum, maximum and average number of neighbors and percent non-zero (an indication of sparsity)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5de46f3",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "wq.min_neighbors,wq.max_neighbors,wq.mean_neighbors,wq.pct_nonzero"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c406929f",
+ "metadata": {},
+ "source": [
+ "There is no explicit check for symmetry as such, but instead the lack of symmetry can be assessed by means of the `asymmetry` method, or the list of id pairs with asymmetric weights is obtained by means of the `asymmetries` attribute."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "430a2f59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(wq.asymmetry())\n",
+ "print(wq.asymmetries)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "545d5999",
+ "metadata": {},
+ "source": [
+ "Since contiguity weights are symmetric by construction, the presence of an asymmetry would indicate some kind of error. This is not the case here."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa2ee8a3",
+ "metadata": {},
+ "source": [
+ "### K-Nearest Neighbors Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e856073b",
+ "metadata": {},
+ "source": [
+ "Similarly, the symmetric knn weights (k=6) created by `GeoDa` can be read from the file **infileknn**:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a86f85ce",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infileknn)\n",
+ "wk6s = open(inpath).read()\n",
+ "wk6s"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4ea8c6c",
+ "metadata": {},
+ "source": [
+ "Some characteristics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "74c91c22",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk6s.n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f55fc305",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "print(wk6s.min_neighbors,wk6s.max_neighbors,wk6s.mean_neighbors,wk6s.pct_nonzero)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "212cd898",
+ "metadata": {},
+ "source": [
+ "Note how the operation to make the initially asymmetric k-nearest neighbor weights symmetric has resulted in many observations having more than 6 neighbors (`max_neighbors` is larger than 6). That is the price to pay to end up with symmetric weights, which is required for some of the estimation methods. We can list neighbors and weights in the usual way. As it turns out, the observation with key `1` is not adjusted, but observation with key `3` now has eight neighbors (up from the original six).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8fb35473",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk6s.neighbors['1']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3bf5d207",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk6s.neighbors['3']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69dc85a3",
+ "metadata": {},
+ "source": [
+ "### Kernel Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f523e10a",
+ "metadata": {},
+ "source": [
+ "Triangular kernel weights based on a variable bandwidth with 10 nearest neighbors created by `GeoDa` are contained in the file **infilekwt**. The properties of kernel weights are considered in more detail in a later notebook.\n",
+ "\n",
+ "The weights can be read in the usual fashion, by means of `libpysal.io.open`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "cc957469",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infilekwt)\n",
+ "kwtri = open(inpath).read()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7f9a159",
+ "metadata": {},
+ "source": [
+ "However, this does not give the desired result. The object is not recognized as kernel weights, but\n",
+ "as a standard spatial weights object, as revealed by checking the `type`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b8aa4de",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(type(kwtri))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b69738a",
+ "metadata": {},
+ "source": [
+ "The kernel weights can be checked with the usual `weights` attribute. However, the values for the keys in this example are not characters, but simple integers. This is revealed by a quick check of the keys."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b738d26",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(kwtri.neighbors.keys())[0:5])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "70e0fafb",
+ "metadata": {},
+ "source": [
+ "Now, with the integer 1 as the key, the contents of the weights can be listed. Note the weight of 1.0 for the diagonal element. All is fine, except that *PySAL* does not recognize the weights as kernel weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "996a5c2b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(kwtri.weights[1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c4584ce6",
+ "metadata": {},
+ "source": [
+ "The alternative, using the `weights.Kernel.from_file` method from `libpysal`, has the same problem."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbf96029",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kwtri10f = weights.Kernel.from_file(inpath)\n",
+ "print(type(kwtri10f))\n",
+ "print(kwtri10f.weights[1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f9b8309",
+ "metadata": {},
+ "source": [
+ "#### Changing the class of weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d28c0700",
+ "metadata": {},
+ "source": [
+ "In this particular case, a hack is to force the class of the weights object to be that of kernel weights. This is generally not recommended, but since the object in question has all the characteristics of kernel weights, it is safe to do so.\n",
+ "\n",
+ "It is accomplished by setting the attribute `__class__` of the weights object to `libpysal.weights.distance.Kernel`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a323c47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kwtri10f.__class__ = weights.distance.Kernel\n",
+ "print(type(kwtri10f))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2efc8b5b",
+ "metadata": {},
+ "source": [
+ "## Creating Weights from a GeoDataFrame"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c193a5dd",
+ "metadata": {},
+ "source": [
+ "### Queen Contiguity Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "713be4b9",
+ "metadata": {},
+ "source": [
+ "In *PySAL*, the spatial weights construction is handled by `libpysal.weights`. The generic pattern is `weights.<criterion>.from_dataframe`, here `weights.Queen.from_dataframe`, with as arguments the geodataframe and optionally the `ids` (recommended). For the Chicago data, the ID variable is **OBJECTID**. To make sure the latter is an integer (it is not in the original data frame), its type is changed by means of the `astype` method. \n",
+ "\n",
+ "The same contiguity weights can also be created directly from a shape file, using `weights.Queen.from_shapefile`. This is left as an exercise, but a brief sketch is given after the next code cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "015c694f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infileshp)\n",
+ "dfs = gpd.read_file(inpath)\n",
+ "dfs = dfs.astype({'OBJECTID':'int'})\n",
+ "wq1 = weights.Queen.from_dataframe(dfs,ids='OBJECTID')\n",
+ "wq1"
+ ]
+ },
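+ {
+ "cell_type": "markdown",
+ "id": "a1f0b2c3",
+ "metadata": {},
+ "source": [
+ "As an illustration of the exercise mentioned above, the contiguity weights can also be built directly from the shape file by means of `weights.Queen.from_shapefile`. The sketch below assumes the same **infileshp** path (already stored in **inpath**) and uses the `idVariable` argument to set **OBJECTID** as the ID variable. It is included for reference only and is not needed for the remainder of the notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2c3d4e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch only: queen contiguity weights built directly from the shape file\n",
+ "# idVariable is assumed to correspond to the OBJECTID field used above\n",
+ "wq1s = weights.Queen.from_shapefile(inpath,idVariable='OBJECTID')\n",
+ "print(wq1s.n)\n",
+ "print(list(wq1s.neighbors.keys())[0:5])"
+ ]
+ },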
+ {
+ "cell_type": "markdown",
+ "id": "9029a443",
+ "metadata": {},
+ "source": [
+ "A quick check on the keys reveals these are integers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e404c2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(wq1.neighbors.keys())[0:5])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9263cc3",
+ "metadata": {},
+ "source": [
+ "Again, some characteristics:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d0f697e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq1.n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7cbad937",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(wq1.min_neighbors,wq1.max_neighbors,wq1.mean_neighbors,wq1.pct_nonzero)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "baf427f5",
+ "metadata": {},
+ "source": [
+ "The structure of the weights is identical to that from the file read from `GeoDa`. For example, the first set of neighbors and weights are:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e7321592",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(wq1.neighbors[1])\n",
+ "print(wq1.weights[1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7dfc46a3",
+ "metadata": {},
+ "source": [
+ "### Row-standardization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0756cb5e",
+ "metadata": {},
+ "source": [
+ "As created, the weights are binary, with a value of 1.0 for each neighbor. To turn the weights into row-standardized form, a *transformation* is needed, `wq1.transform = 'r'`:\n",
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "931dda95",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq1.transform = 'r'\n",
+ "wq1.weights[1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b659247c",
+ "metadata": {},
+ "source": [
+ "### Writing a Weights File"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "05e387f1",
+ "metadata": {},
+ "source": [
+ "To write out the weights object to a GAL file, `libpysal.io.open` is used with the `write` method. The argument to the `open` command is the filename and `mode='w'` (for writing a file). The weights object itself is the argument to the `write` method.\n",
+ "\n",
+ "Note that even though the weights are row-standardized, this information is lost in the output file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "d0547d6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "open(outfileq,mode='w').write(wq1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1c928df1",
+ "metadata": {},
+ "source": [
+ "A quick check uses the `weights.Queen.from_file` operation on the just created weights file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "681ef494",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq1a = weights.Queen.from_file(outfileq)\n",
+ "print(wq1a.n)\n",
+ "print(list(wq1a.neighbors.keys())[0:5])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f26a5b6",
+ "metadata": {},
+ "source": [
+ "Note how the type of the key has changed from integer above to character after reading from the outside file. This again stresses the importance of checking the keys before any further operations.\n",
+ "\n",
+ "The weights are back to their original binary form, so the row-standardization is lost after writing the output file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bc1e60e9",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "wq1a.weights['1']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b4e53bca",
+ "metadata": {},
+ "source": [
+ "### KNN Weights\n",
+ "\n",
+ "The corresponding functionality for k-nearest neighbor weights is `weights.KNN.from_dataframe`. An important argument is `k`, the number of neighbors, with the default set to `2`, which is typically not that useful. Again, **OBJECTID** is included as the ID variable. Initially, the weights are in binary form; as before, they are row-standardized below, once they have been made symmetric.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2128e659",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk6 = weights.KNN.from_dataframe(dfs,k=6,ids='OBJECTID')\n",
+ "print(wk6.n)\n",
+ "print(list(wk6.neighbors.keys())[0:5])\n",
+ "wk6"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e5173eb",
+ "metadata": {},
+ "source": [
+ "To compare the just created weights to the symmetric form read into **wk6s**, the list of neighbors for observation 3 is informative. It consists of six neighbors, a subset of the eight listed above for the symmetric knn weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "599541a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(wk6.neighbors[3])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d7b4f435",
+ "metadata": {},
+ "source": [
+ "The k-nearest neighbor weights are intrinsically asymmetric. Rather than listing all the pairs that contain such asymmetries, the length of this list can be checked using the `asymmetry` method."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1b112430",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "print(len(wk6.asymmetry()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02aec84c",
+ "metadata": {},
+ "source": [
+ "KNN weights have a built-in method to make them symmetric: `symmetrize`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "239b43ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk6s2 = wk6.symmetrize()\n",
+ "print(len(wk6.asymmetry()))\n",
+ "print(len(wk6s2.asymmetry()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd55eca5",
+ "metadata": {},
+ "source": [
+ "The entries are now the same as for the symmetric knn GAL file that was read in from `GeoDa`. For example, the neighbors of observation with key `3` are:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ea5eb99d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "print(wk6s2.neighbors[3])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9c828bbe",
+ "metadata": {},
+ "source": [
+ "Finally, to make them row-standardized, the same transformation is used."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e98d4fde",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk6s2.transform = 'r'\n",
+ "wk6s2.weights[3]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a380252",
+ "metadata": {},
+ "source": [
+ "## Kernel Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d805bada",
+ "metadata": {},
+ "source": [
+ "There are several ways to create the kernel weights that are used later in the course, for example to compute HAC standard errors in ordinary least squares regression. One is to create the weights in `GeoDa` and save them as a weights file with a **kwt** extension. However, currently, there is a bug in libpysal so that the proper class needs to be set explicitly.\n",
+ "\n",
+ "The alternative is to compute the weights directly with `PySAL`. This can be implemented in a number of ways. One is to create the weights using the `libpysal.weights.Kernel` function, passing a matrix of x-y coordinates. Another is to compute the weights directly from the information in a shape file, using `libpysal.weights.Kernel.from_shapefile`.\n",
+ "\n",
+ "Each is considered in turn."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8b452302",
+ "metadata": {},
+ "source": [
+ "### Kernel Weights Computation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3734b6f",
+ "metadata": {},
+ "source": [
+ "Direct computation of kernel weights takes as input an array of coordinates. Typically these are the coordinates of the locations, but it is a perfectly general approach and can take any number of variables to compute *general* distances (or economic distances). In the example, the X and Y coordinates contained in the geodataframe **dfs** are used as `COORD_X` and `COORD_Y`. \n",
+ "\n",
+ "First, the respective columns from the data frame are turned into a numpy array.\n",
+ "\n",
+ "The command to create the kernel weights is `libpysal.weights.Kernel`. It takes the array as the first argument, followed by a number of options. To have a variable bandwidth that follows the 10 nearest neighbors, \n",
+ "`fixed = False` (the default is a fixed bandwidth) and `k=10`. The kernel function is selected as `function=\"triangular\"` (this is also the default, but it is included here for clarity). Finally, the use of kernel weights in the HAC calculations requires the diagonals to be set to the value of one, achieved by means\n",
+ "of `diagonal=True`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3a98748",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "coords = np.array(dfs[['COORD_X','COORD_Y']])\n",
+ "kwtri10 = weights.Kernel(coords,fixed=False,k=10,\n",
+ " function=\"triangular\",diagonal=True)\n",
+ "print(type(kwtri10))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80d86e62",
+ "metadata": {},
+ "source": [
+ "The result is an object of class `libpysal.weights.distance.Kernel`. This contains several attributes, such as the kernel function used."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "82abf5af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kwtri10.function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b3a6d3c3",
+ "metadata": {},
+ "source": [
+ "A check on the keys. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff74853e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(kwtri10.neighbors.keys())[0:5])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19215410",
+ "metadata": {},
+ "source": [
+ "Note that the index starts at 0 and the keys are integers. The neighbors for the first observation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f47c9c44",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "kwtri10.neighbors[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e6ea4dc",
+ "metadata": {},
+ "source": [
+ "The kernel weights for the first observation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "380606c6",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "kwtri10.weights[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "75e81a3c",
+ "metadata": {},
+ "source": [
+ "These are the same values as we obtained above from reading the kwt file, but now they are recognized as a proper kernel weights object."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc1b40f5",
+ "metadata": {},
+ "source": [
+ "### Kernel Weights from a Shape File"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df575721",
+ "metadata": {},
+ "source": [
+ "Contiguity weights, distance weights and kernel weights can also be constructed directly from a shape file, using the relevant `from_shapefile` methods. For kernel weights, this can be based on either point coordinates or on the coordinates of polygon centroids to compute the distances needed. The relevant function is `libpysal.weights.Kernel.from_shapefile` with as its main argument the file (path) name of the \n",
+ "shape file involved. The other arguments are the same options as before. The shape file in **infileshp** is used as the input file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7cc1d8ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inpath = get_path(infileshp)\n",
+ "kwtri10s = weights.Kernel.from_shapefile(inpath,\n",
+ " fixed=False,k=10,\n",
+ " function=\"triangular\",diagonal=True)\n",
+ "print(type(kwtri10s))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "380bf1a0",
+ "metadata": {},
+ "source": [
+ "The result is of the proper type and contains the same structure as before, with matching function, neighbors and weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d776d966",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(kwtri10s.function)\n",
+ "print(list(kwtri10s.neighbors.keys())[0:5])\n",
+ "print(kwtri10s.neighbors[0])\n",
+ "print(kwtri10s.weights[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "948f1fb0",
+ "metadata": {},
+ "source": [
+ "### Writing the Kernel Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "03f68cfb",
+ "metadata": {},
+ "source": [
+ "We use the same method as for the queen weights to write the just constructed kernel weights to an outside kwt file. The output file is `outfilek`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "9ec284cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "open(outfilek,mode='w').write(kwtri10s)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d05c15d0",
+ "metadata": {},
+ "source": [
+ "Quick check:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7ab1227",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kk = weights.Kernel.from_file(outfilek)\n",
+ "print(type(kk))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "021da548",
+ "metadata": {},
+ "source": [
+ "So, the same problem as mentioned above persists for weights files written by *PySAL*, and the proper class needs to be set explicitly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "183515cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "kk.__class__ = weights.distance.Kernel\n",
+ "print(type(kk))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0eeb182",
+ "metadata": {},
+ "source": [
+ "## Special Weights Operations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4c7a1d94",
+ "metadata": {},
+ "source": [
+ "A few special weights operations will come in handy later on. One is to create spatial weights for a regular grid setup, which is very useful for simulation designs. The other is to turn a spatial weights object into a standard numpy array, which can be used in all kinds of matrix operations."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3507af4d",
+ "metadata": {},
+ "source": [
+ "### Weights for Regular Grids"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1686fa14",
+ "metadata": {},
+ "source": [
+ "The `weights.lat2W` operation creates rook contiguity spatial weights for a regular rectangular grid, with the number of rows and the number of columns as the arguments (rook is the default; queen contiguity is obtained with `rook = False`). The result is a simple binary weights object, so row-standardization is typically needed as well.\n",
+ "\n",
+ "For a square grid, with **gridside=20** as the number of rows/columns, the result has dimension 400."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32e033b7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gridside = 20\n",
+ "wgrid = weights.lat2W(gridside,gridside,rook=True)\n",
+ "wgrid.n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80e3a3ae",
+ "metadata": {},
+ "source": [
+ "Quick check on the neighbor keys."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d19a637f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "print(list(wgrid.neighbors.keys())[0:5])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6979a434",
+ "metadata": {},
+ "source": [
+ "Since this is a square grid, the first observation, in the upper left corner, has only two neighbors, one\n",
+ "to the right (1) and one below (20 - since the first row goes from 0 to 19)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d97e2ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wgrid.neighbors[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfd62652",
+ "metadata": {},
+ "source": [
+ "Row-standardization yields the actual weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75cc9d20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wgrid.transform = 'r'\n",
+ "wgrid.weights[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b930a20a",
+ "metadata": {},
+ "source": [
+ "Any non-border cell has four neighbors, one to the left, right, up and down."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f4fe7013",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wgrid.weights[21]"
+ ]
+ },
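+ {
+ "cell_type": "markdown",
+ "id": "c3d4e5f6",
+ "metadata": {},
+ "source": [
+ "For completeness, the queen contiguity version of the same grid, mentioned above, is obtained with `rook=False`. The sketch below is only illustrative: in the queen case, an interior cell such as observation 21 has eight neighbors instead of four."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4e5f6a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: queen contiguity for the same regular grid\n",
+ "wgridq = weights.lat2W(gridside,gridside,rook=False)\n",
+ "wgridq.transform = 'r'\n",
+ "wgridq.weights[21]"
+ ]
+ },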
+ {
+ "cell_type": "markdown",
+ "id": "634ddd29",
+ "metadata": {},
+ "source": [
+ "### Weights as Matrices"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f79a8b0",
+ "metadata": {},
+ "source": [
+ "The `weights.full` operation turns a spatial weights object into a standard numpy array. The function returns a tuple, of which the first element is the actual matrix and the second consists of a list of keys. For actual matrix operations, the latter is not that useful.\n",
+ "\n",
+ "It is important to remember to always extract the first element of the tuple as the matrix of interest. Otherwise, one quickly runs into trouble with array operations.\n",
+ "\n",
+ "This is illustrated for the row-standardized queen weights **wq1** created earlier."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a8a19df1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq1full, wqfkeys = weights.full(wq1)\n",
+ "print(type(wq1full),type(wqfkeys))\n",
+ "wq1full.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b47feef",
+ "metadata": {},
+ "source": [
+ "## Spatially Lagged Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "355c9d91",
+ "metadata": {},
+ "source": [
+ "Spatially lagged variables are essential in the specification of spatial regression models. They are the product of a spatial weight matrix with a vector of observations and yield new values as (weighted) averages of the values observed at neighboring locations (with the neighbors defined by the spatial weights).\n",
+ "\n",
+ "This is illustrated for the variable **y_name** extracted from the data frame. Its mean and standard deviation are listed using the standard `numpy` methods."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f357ab21",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y = np.array(dfs[y_name])\n",
+ "print(y.shape)\n",
+ "print(y.mean())\n",
+ "print(y.std())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f369e77",
+ "metadata": {},
+ "source": [
+ "The new spatially lagged variable is created with the `weights.lag_spatial` command, passing the weights object **wq1** and the vector of interest, **y**. It is important to make sure that the dimensions match. In particular, if the vector in question is not an actual column vector, but a one-dimensional array, the result will not be a vector, but an array. This may cause trouble in some applications."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc5f2f0c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wy = weights.lag_spatial(wq1,y)\n",
+ "print(wy.shape)\n",
+ "print(wy.mean())\n",
+ "print(wy.std())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0d706e58",
+ "metadata": {},
+ "source": [
+ "The result is a column vector. The mean roughly corresponds to that of the original variable, but the spatially lagged variable has a smaller standard deviation. This illustrates the *smoothing* implied by the spatial lag operation.\n",
+ "\n",
+ "To illustrate the problem with numpy arrays rather than vectors, the original vector is flattened and then the `lag_spatial` operation is applied to it. Everything works fine, except that the result is an array, and not a column vector. A simple fix is shown below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b45d600a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "yy = y.flatten()\n",
+ "print(yy.shape)\n",
+ "wyy = weights.lag_spatial(wq1,yy)\n",
+ "print(wyy.shape)"
+ ]
+ },
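+ {
+ "cell_type": "markdown",
+ "id": "e5f6a7b8",
+ "metadata": {},
+ "source": [
+ "If a column vector is needed for later matrix operations, the one-dimensional result can simply be reshaped. A minimal sketch, using `reshape` and a check that the values match those in **wy**:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6a7b8c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# reshape the flattened result back into a column vector\n",
+ "wyyc = wyy.reshape(-1,1)\n",
+ "print(wyyc.shape)\n",
+ "print(np.allclose(wyyc,wy.reshape(-1,1)))"
+ ]
+ },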
+ {
+ "cell_type": "markdown",
+ "id": "682469c5",
+ "metadata": {},
+ "source": [
+ "The same result can also be obtained using an explicit matrix-vector multiplication with the full matrix **wq1full** just created."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "622eb958",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wy1 = wq1full @ y\n",
+ "print(wy1.shape)\n",
+ "print(wy1.mean())\n",
+ "print(wy1.std())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "Experiment with various spatial weights for your own data set or for one of the PySAL sample data sets. Create a spatially lagged variable for each of the weights and compare their properties, such as the mean, standard deviation, correlation between the original variable and the spatial lag, etc.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/5_OLS.ipynb.txt b/_sources/notebooks/5_OLS.ipynb.txt
new file mode 100644
index 00000000..3d1e683f
--- /dev/null
+++ b/_sources/notebooks/5_OLS.ipynb.txt
@@ -0,0 +1,1108 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Basic Ordinary Least Squares Regression (OLS)\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/08/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, the basic OLS regression and elementary regression diagnostics are reviewed. In addition, a number of concepts related to robust standard errors are covered.\n",
+ "\n",
+ "Technical details are treated in Chapter 5 in Anselin and Rey (2014). *Modern Spatial Econometrics in Practice*.\n",
+ "\n",
+ "Video recordings of the basics and non-spatial diagnostics are available from the GeoDa Center YouTube channel playlist *Applied Spatial Regression - Notebooks*:\n",
+ "- https://www.youtube.com/watch?v=x63dL2M5x_0&list=PLzREt6r1NenmhNy-FCUwiXL17Vyty5VL6&index=5\n",
+ "- https://www.youtube.com/watch?v=pimNfZyOyKk&list=PLzREt6r1NenmhNy-FCUwiXL17Vyty5VL6&index=6"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a443690f",
+ "metadata": {},
+ "source": [
+ "### Prerequisites\n",
+ "\n",
+ "Familiarity with *pandas*, *geopandas* and *libpysal* is assumed to read data files and manipulate spatial weights. In addition, the sample data set **chicagoSDOH** should be installed."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main module for spatial regression in PySAL is *spreg*. In addition *libpysal* is needed to handle the example data sets and spatial weights, and *pandas* and *geopandas* for data input and output. This notebook is based on version 1.7 of *spreg*. In order to make the graphs a bit nicer looking, *matplotlib.pyplot* is imported as well, although this is not strictly needed since it is behind the plotting functionality of *pandas* and *geopandas*. Importing *matplotlib.pyplot* also allows for some further customization of the plots and maps, but that is not pursued in detail here.\n",
+ "\n",
+ "Some additional imports are included to avoid excessive warning messages. With later versions of PySAL, these may not be needed. As before, in order to avoid some arguably obnoxious new features of *numpy* 2.0, it is necessary to include the `set_printoptions` command if you are using a Python 3.12 environment with numpy 2.0 or greater."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "import libpysal.weights as weights\n",
+ "from spreg import OLS, f_stat, wald_test\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functionality Used\n",
+ "\n",
+ "- from numpy:\n",
+ " - array\n",
+ " - hstack\n",
+ " - identity\n",
+ " - zeros\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " - DataFrame\n",
+ " - concat\n",
+ " - to_file\n",
+ " - plot\n",
+ " \n",
+ "- from libpysal:\n",
+ "\n",
+ " - examples.get_path\n",
+ " - io.open\n",
+ " - weights.distance.Kernel\n",
+ " \n",
+ "- from spreg:\n",
+ " - OLS\n",
+ " - f_stat\n",
+ " - wald_test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9b0c168",
+ "metadata": {},
+ "source": [
+ "### Input Files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "74ab0075",
+ "metadata": {},
+ "source": [
+ "All notebooks are organized such that the relevant filenames and variables names are listed at the top, so that they can be easily adjusted for use with your own data sets and variables.\n",
+ "All notebooks are organized such that the relevant filenames and variable names are listed at the top, so that they can be easily adjusted for use with your own data sets and variables.\n",
+ "In the examples, the **Chi-SDOH** sample shape file is used, with associated kernel weights (for HAC standard errors). The specific file names are:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic determinants of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_k10tri.kwt**: triangular kernel weights based on a variable bandwidth with 10 nearest neighbors from `GeoDa`\n",
+ "\n",
+ "In this and the other *spreg* related notebooks, it is assumed that you have installed the **chicagoSDOH** example data set using `libpysal.examples.load_example(\"chicagoSDOH\")`. \n",
+ "\n",
+ "The input files are specified generically as **infileshp** (for the shape file) and **infilek** (for the kernel weights). In addition, optionally, an output file is specified to add predicted values and residuals to a shape file as **outfileshp**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "4d4335bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infilek = get_path(\"Chi-SDOH_k10tri.kwt\") # triangular kernel weights from GeoDa\n",
+ "outfileshp = \"./testols.shp\" # output shape file with predicted values and residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6613bdc",
+ "metadata": {},
+ "source": [
+ "### Variable Names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "06db06ed",
+ "metadata": {},
+ "source": [
+ "In this notebook, the regression model is illustrated in the context of the *immigrant paradox*. This is based on\n",
+ "four variables from the SDOH data set: **YPLL_rate** (an index measuring premature mortality, i.e., higher values are worse health outcomes), **HIS_ct** (economic hardship index), **Blk14P** (percent Black population), and **Hisp14P** \n",
+ "(percent Hispanic population).\n",
+ "\n",
+ "The easiest way to specify a generic regression model is to first create lists with the variable names for the dependent variable (**y_name**), the explanatory variables (**x_names1** and **x_names2**), the data set name (optional, as **ds_name**), the name of the contiguity weights (optional, as **w_name**, but not needed in this notebook), and the file name for the kernel weights (optional, as **gwk_name**). In this way, all the regression commands pertain to these generic variable names and do not need to be adjusted for different specifications and/or data sets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "69476f7b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_name = 'YPLL_rate'\n",
+ "x_names1 = ['Blk14P','Hisp14P']\n",
+ "x_names2 = ['Blk14P','Hisp14P','HIS_ct']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "gwk_name = 'Chi-SDOH_k10tri'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35566ba3",
+ "metadata": {},
+ "source": [
+ "### Reading Input Files from the Example Data Set"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73afed82",
+ "metadata": {},
+ "source": [
+ "The actual path to the files contained in the local copy of the remote data set was found above by means of the *libpysal* `get_path` command. This is then passed to the *geopandas* `read_file` function in the usual way.\n",
+ "\n",
+ "As mentioned earlier, if the example data are not installed locally by means of `libpysal.examples`, the `get_path` command must be replaced by an explicit reference to the correct file path name. This is easiest if the files are in the current working directory, in which case just specifying the file names in **infileshp** etc. is sufficient."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8d93a92",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "print(dfs.shape)\n",
+ "print(dfs.columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ba18d32",
+ "metadata": {},
+ "source": [
+ "### Reading Weights "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f009a3bb",
+ "metadata": {},
+ "source": [
+ "The weights are read by means of the `libpysal.io.open` command, which was imported with an `open` alias. In the example, the kernel weights are loaded from a file created with *GeoDa*, and the full path was specified above using `get_path`. After reading the weights, their `class` is adjusted to avoid raising an exception in the computation of HAC standard errors, as illustrated in the spatial weights notebook (note that `libpysal.weights` was imported as `weights`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2a933b3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk = open(infilek).read()\n",
+ "print(\"dimension \",wk.n)\n",
+ "wk.__class__ = weights.distance.Kernel\n",
+ "print(\"type of wk \",wk.__class__)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f3b8116",
+ "metadata": {},
+ "source": [
+ "### Setting up the Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e310f464",
+ "metadata": {},
+ "source": [
+ "In legacy *spreg*, variables were typically read from a *dBase* data file (associated with a shape file). This required a multi-step process to extract the variables and then turn them into *numpy* arrays. With the more recent support for *pandas* and *geopandas*, this process has become greatly simplified. The variables can be loaded directly from the *pandas* or *geopandas* data frame. As a side effect, this no longer requires the `name_y` and `name_x` arguments to be set explicitly in the OLS call.\n",
+ "\n",
+ "The setup uses the variable names specified in the respective **y_name**, **x_names**, etc. lists. \n",
+ "\n",
+ "Note that there is no constant term in the x matrices. A constant vector is included by default in the `spreg.OLS` routine."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "1b1e2614-aea5-4eb5-86d4-de90665c2e0d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y = dfs[y_name]\n",
+ "x1 = dfs[x_names1]\n",
+ "x2 = dfs[x_names2]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "17b17262",
+ "metadata": {},
+ "source": [
+ "## OLS Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "edf37d1b",
+ "metadata": {},
+ "source": [
+ "A first illustration uses the simplest of OLS regressions, where only y and X are specified as arguments in the `spreg.OLS` command. Since `OLS` was imported explicitly, the `spreg` part is omitted in what follows.\n",
+ "\n",
+ "The resulting OLS object has many attributes. An easy (the easiest) way to list the results is to print the `summary` attribute.\n",
+ "\n",
+ "First, the regression with the two ethnicity explanatory variables is considered. The dependent variable is **y** and the explanatory variables are contained in **x1**.\n",
+ "\n",
+ "The option `nonspat_diag=False` must be set, since the default will provide the diagnostics, which are considered in more detail below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "de3d5549",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x1,nonspat_diag=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ef9d40af",
+ "metadata": {},
+ "source": [
+ "The basic result of this operation is to create an OLS object. The range of attributes and methods included can be readily viewed by means of the `dir` command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "29d10e41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dir(ols1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "446bc1da",
+ "metadata": {},
+ "source": [
+ "The regression coefficients are in `betas`, the predicted values in `predy`, residuals in `u`, the coefficient variance-covariance matrix in `vm`, and the $R^2$ measure of fit in `r2`. The `summary` method provides a nice looking output listing."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e69dccca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7731d364",
+ "metadata": {},
+ "source": [
+ "As is, the output does not list any information on the data set. The variable names were extracted directly from the data frame. In order to have a more informative output, `name_ds` should be specified in the arguments, as illustrated below. Since no spatial diagnostics are specified so far, the **Weights matrix** is listed as **None**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28ce15e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1a = OLS(y,x1,nonspat_diag=False,\n",
+ " name_ds=ds_name)\n",
+ "print(ols1a.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a00aa4ab",
+ "metadata": {},
+ "source": [
+ "This bare bones regression output lists the variables and data set involved. In addition to the coefficient estimates, standard errors, t-statistics and p-values, it includes the $R^2$ and adjusted $R^2$ as measures of fit. The mean and standard deviation of the dependent variable are given as well. \n",
+ "\n",
+ "The overall fit of about 0.60 is reasonable and both slope coefficients are positive and highly significant. "
+ ]
+ },
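+ {
+ "cell_type": "markdown",
+ "id": "a7b8c9d0",
+ "metadata": {},
+ "source": [
+ "Beyond the formatted summary, the individual attributes mentioned earlier can be accessed directly, which is often needed for later computations. A minimal sketch, using the `betas`, `r2`, `predy` and `u` attributes of the regression object:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8c9d0e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# direct access to selected regression attributes\n",
+ "print(ols1a.betas)\n",
+ "print(ols1a.r2)\n",
+ "print(ols1a.predy.shape, ols1a.u.shape)"
+ ]
+ },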
+ {
+ "cell_type": "markdown",
+ "id": "dbcde13a-142c-4ecc-8fb4-db9fe8f5fdab",
+ "metadata": {},
+ "source": [
+ "### Alternative way to pass arguments"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bf21520-8492-47c7-b040-6bf2b39ae94b",
+ "metadata": {},
+ "source": [
+ "The approach taken in the example, whereby **y** and **x1** are passed, is mostly for convenience. In practice, one can specify the dependent and explanatory variables directly as subsets from a data frame. For example, one could use `dfs[\"YPLL_rate\"]` and `dfs[[\"Blk14P\",\"Hisp14P\"]]` directly as arguments instead of taking the extra step to define **y** and **x1** separately. The results are identical. Which approach one takes is a matter of preference."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0d6b3599-62aa-41e9-9cfc-a34c403dbb04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1b = OLS(dfs[\"YPLL_rate\"],dfs[[\"Blk14P\",\"Hisp14P\"]],nonspat_diag=False,\n",
+ " name_ds=ds_name)\n",
+ "print(ols1b.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5cca34cc-bb02-4275-8a9d-9bd53b17a95d",
+ "metadata": {},
+ "source": [
+ "### Immigrant paradox"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11b0c2e4-cff1-4090-a2e6-4813133652a1",
+ "metadata": {},
+ "source": [
+ "As it turns out, the initial regression result is somewhat misleading, which is revealed when the economic hardship variable is included. This is implemented by specifying **x2** as the x-variable argument."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "71eaba25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols2 = OLS(y,x2,nonspat_diag=False,\n",
+ " name_ds=ds_name)\n",
+ "print(ols2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a3beb65b",
+ "metadata": {},
+ "source": [
+ "The inclusion of economic hardship turned the coefficient of the Hispanic share from positive to\n",
+ "negative. This is the so-called *immigrant paradox*. All coefficients are highly significant, with an adjusted $R^2$ of 0.6316."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab78c8bc",
+ "metadata": {},
+ "source": [
+ "### Predicted Values and Residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a36bc6c",
+ "metadata": {},
+ "source": [
+ "Two important attributes of the regression object are the predicted values and residuals. Unlike many of the other attributes, these are full-length column vectors of size n x 1. They can be extracted as the `predy` and `u` attributes. A quick check is included to make sure the dimensions are correct.\n",
+ "\n",
+ "For the residuals, the `mean()` is computed to confirm that it is zero."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa1829a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "yp = ols2.predy\n",
+ "yp.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c2b5c0dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "resid = ols2.u\n",
+ "print(resid.mean())\n",
+ "resid.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "83537680-abd9-444d-9cf4-8990cf2238ae",
+ "metadata": {},
+ "source": [
+ "#### Adding predicted values and residuals to the data frame"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "93cf9a3e",
+ "metadata": {},
+ "source": [
+ "The most useful application of the predicted values and residuals in a diagnostic sense is to plot and map them. Before briefly illustrating this, it is shown how the two vectors can be added to the data frame and then optionally written out to a new shape file. This uses the `DataFrame` functionality from *pandas* and horizontal vector stacking (`hstack`) from *numpy*."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "792e0ec7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preds = pd.DataFrame(np.hstack((yp,resid)),columns=['ypred','resid'])\n",
+ "preds.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0341e6e1",
+ "metadata": {},
+ "source": [
+ "The data frame with the predicted values and residuals is then concatenated (`pandas.concat`) with the current shape file and can be written to the output file **outfileshp** using the `to_file` function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "b0ba9486",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = pd.concat([dfs,preds],axis=1)\n",
+ "dfs.to_file(outfileshp,mode='w')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e7c13af0-0072-4ddb-bd04-03fea69785db",
+ "metadata": {},
+ "source": [
+ "#### Residuals diagnostic plot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52a86141-dc66-4c89-9956-06b3e36d20e5",
+ "metadata": {},
+ "source": [
+ "A common residual diagnostic plot is a scatter plot of the residuals against the predicted values. If the error variance is constant (homoskedasticity), the point cloud should form a more or less parallel band around the value of zero (the mean of the residuals). On the other hand, a fan-like shape or clearly different spreads for different ranges of the predicted values would suggest heteroskedasticity.\n",
+ "\n",
+ "The *pandas* `plot` functionality can be used to make a simple scatter plot of residuals against predicted values. In the example, **ypred** is used as the `x` argument, **resid** as the `y` argument, with `kind = \"scatter\"`. With *matplotlib.pyplot* imported, `plt.show()` will produce a clean graph. Alternatively, one can set the plot object to `ax`, as illustrated in the mapping notebook. This can be useful for further customization, but is not pursued here."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4d03ba70-6d2e-4d27-a05e-5ca793d2cb4d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs.plot(x=\"ypred\",y=\"resid\",kind='scatter',title='Diagnostic plot')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e2afd10c-bdf1-492e-b8db-dbee160a6ada",
+ "metadata": {},
+ "source": [
+ "The slight fan-like shape of the residuals with values further away from zero for increasing predicted values is a *visual* diagnostic suggesting heteroskedasticity. However, since this is purely visual, it remains only suggestive. More formal tests against heteroskedasticity are considered below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eddaa4fa-cded-4385-b13d-41e361bc50c1",
+ "metadata": {},
+ "source": [
+ "#### Residual map"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "60f30322-6954-4e0c-82a5-3ce264c0dad4",
+ "metadata": {},
+ "source": [
+ "Some insight into possible spatial patterns among the residuals can be gained from a residual map, i.e., a choropleth map of the residuals. This is readily implemented by means of the `plot` functionality of a *geopandas* object. The most useful classification in this respect is a *standard deviational* map, for which the bins in the classification are based on standard deviational units. This has two advantages: (1) it clearly shows the positive and negative residuals; and (2) it highlights *outliers* as residuals that are more than two standard deviational units away from zero.\n",
+ "\n",
+ "As covered in a previous notebook, a standard deviational map is obtained by setting the argument `scheme = \"std_mean\"` in the *geopandas* `plot` function. This relies on the PySAL *mapclassify* package under the hood, but the latter does not need to be imported separately. As pointed out in the mapping notebook, in *mapclassify*, the convention is to merge all observations that are within one standard deviation below and above the mean into a single category. This may not be optimal for all types of maps (it can be customized by means of `anchor`), but it is useful for a residual map. Specifically, all observations that have residuals within one standard deviation from the mean (zero) are shown in the same color (here, white). To highlight the difference between negative and positive residuals, the color map `cmap=\"bwr\"` is chosen. In this scheme, the negative residuals are shades of blue (overprediction), whereas the positive residuals are shades of red (underprediction). The middle category is white.\n",
+ "\n",
+ "In order to make sure that the polygon boundaries are shown as well, `edgecolor = \"black\"` is set as an additional argument (otherwise the census tracts in the middle category cannot be distinguished). The `linewidth` is adjusted somewhat to improve legibility. A legend is set in the usual way.\n",
+ "\n",
+ "Finally, to remove the default axes and to add a title, the `set_axis_off` and `set_title` methods are applied to the plot object. Further customization can always be carried out using more advanced features of *matplotlib*. For some examples, see the mapping notebook.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "23d268a9-d4cd-455d-99e3-a30c8569b79d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ax = dfs.plot(\n",
+ " column = 'resid',\n",
+ " figsize=(8,8),\n",
+ " cmap='bwr',\n",
+ " scheme='std_mean',\n",
+ " edgecolor='black',\n",
+ " linewidth = 0.3,\n",
+ " legend=True,\n",
+ " legend_kwds={'loc':'center left','bbox_to_anchor':(1,0.5), 'title': \"Residuals\"})\n",
+ "ax.set_axis_off()\n",
+ "ax.set_title('Residual Standard Deviational Plot')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b4355ac-fb57-4adf-82ba-a538b823e3d6",
+ "metadata": {},
+ "source": [
+ "The residual map seems to suggest some spatial patterning among the residuals, but such a visual impression can be misleading. More formal approaches are considered in the notebook that deals with Specification Tests."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "245d9f28",
+ "metadata": {},
+ "source": [
+ "### Latex Table Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5a6da470",
+ "metadata": {},
+ "source": [
+ "In addition to the standard regression output shown so far, it is also possible to generate a latex-formatted table for the main coefficient results. This makes it easier to incorporate the regression results into reports or papers. It is accomplished by setting the `latex` option to `True` (the default is `False`).\n",
+ "\n",
+ "Note that only the table with the estimated coefficients, standard errors, etc. is in latex format. The other items (various headings and diagnostics) remain simple text."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "844c667a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols2lat = OLS(y,x2,nonspat_diag=False,\n",
+ " name_ds=ds_name,\n",
+ " latex=True)\n",
+ "print(ols2lat.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "edbf3d3e",
+ "metadata": {},
+ "source": [
+ "## Non-Spatial Diagnostics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41ac5682",
+ "metadata": {},
+ "source": [
+ "The default setting for OLS regression is to always include the non-spatial diagnostics, with `nonspat_diag=True`. Since this is the default, this argument does not have to be set. The default output with diagnostics will be as follows."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff0fc48a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols2a = OLS(y,x2,\n",
+ " name_ds=ds_name)\n",
+ "print(ols2a.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f8b1730b",
+ "metadata": {},
+ "source": [
+ "The previous listing is now augmented with several additional measures of fit (shown above the coefficients) as well as diagnostics for multicollinearity, non-normality and heteroskedasticity (listed below the coefficients).\n",
+ "\n",
+ "The measures of fit on the left-hand column are all related to the sum of squared residuals. This is listed, as well as two measures of $\\sigma^2$ (one from division by the degrees of freedom, the other - ML - from division by the number of observations) and their square roots, the S.E. of regression.\n",
+ "\n",
+ "On the right-hand side are the results of an F-statistic for the joint significance of the coefficients and its associated p-value, the log-likelihood $L$ (under the assumption of normality) for use in comparisons with spatial models, and two adjustments of the log-likelihood for the number of variables in the model, the $AIC$ and $SC$, with $AIC = -2L + 2k$ and $SC = -2L + k \\ln(n)$ ($SC$ is sometimes referred to as $BIC$). Whereas a better fit is reflected in a higher log-likelihood, it is the reverse for AIC and SC (lower is better).\n",
+ "\n",
+ "Below the listing of coefficients are the multicollinearity condition number, the Jarque-Bera test on normality of the errors and two tests for heteroskedasticity (random coefficients): the Breusch-Pagan LM test and the more robust (against non-normality) Koenker-Bassett test.\n",
+ "\n",
+ "There is no evidence of a problem with multicollinearity (typically associated with values larger than 30), but a strong indication of both non-normality and heteroskedasticity."
+ ]
+ },
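+ {
+ "cell_type": "markdown",
+ "id": "c9d0e1f2",
+ "metadata": {},
+ "source": [
+ "As a small worked check of the information criteria, the sketch below recomputes $AIC$ and $SC$ from the log-likelihood and compares them to the reported values. It assumes the `logll`, `aic` and `schwarz` attributes of the regression object, which are available when the non-spatial diagnostics are computed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d0e1f2a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# recompute AIC = -2L + 2k and SC = -2L + k ln(n) from the log-likelihood\n",
+ "aic = -2.0 * ols2a.logll + 2.0 * ols2a.k\n",
+ "sc = -2.0 * ols2a.logll + ols2a.k * np.log(ols2a.n)\n",
+ "print(aic, ols2a.aic)\n",
+ "print(sc, ols2a.schwarz)"
+ ]
+ },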
+ {
+ "cell_type": "markdown",
+ "id": "e0617e69",
+ "metadata": {},
+ "source": [
+ "### White test\n",
+ "\n",
+ "Because it requires additional computation, the White test against heteroskedasticity is not included by default. To include it, the argument `white_test=True` must be set.\n",
+ "\n",
+ "All the output is the same as before, except for the addition of the test results, which again strongly indicate the presence of heteroskedasticity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12992479",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols2b = OLS(y,x2,white_test=True,\n",
+ " name_ds=ds_name)\n",
+ "print(ols2b.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b118604d-7323-48f3-80ba-4a30031cf6f6",
+ "metadata": {},
+ "source": [
+ "### Variance Inflation Factor (VIF)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c63b10f-47d1-44cb-bebb-5aa7946bc9b5",
+ "metadata": {},
+ "source": [
+ "The Variance Inflation Factor or VIF is an alternative to the multicollinearity condition number to assess the sensitivity of the regression results to the presence of multicollinearity. The condition number is computed for all variables jointly and measures the degree of linear dependence among all the columns of $X$. Instead, the VIF is computed for each variable separately. \n",
+ "\n",
+ "The VIF depends on the fit of a regression of the variable in question on all other x-variables, e.g., $R^2_k$ for $x_k$. $1 - R^2_k$ is called the *tolerance* (where $R^2_k$ is the unadjusted $R^2$). The more collinear $x_k$ is with the other variables, the larger $R^2_k$ and thus the lower the tolerance. The VIF is then the inverse of the tolerance.\n",
+ "\n",
+ "The VIF is not reported as part of the standard regression output, since it requires considerable additional computation, but is available by setting the argument `vif = True` (the default setting is `vif = False`). The output is augmented with a listing of the VIF and corresponding tolerance (its inverse) for each coefficient.\n",
+ "\n",
+ "While there is no indication of a multicollinearity problem in the current set of results, it may still be informative to check which variables are the most correlated with the others."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc516cc5-19af-420c-a37f-3e7b21d7658c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols2c = OLS(y,x2,vif=True,\n",
+ " name_ds=ds_name)\n",
+ "print(ols2c.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1cf27dc2-dffa-491a-86e2-92405d9e2957",
+ "metadata": {},
+ "source": [
+ "In this result, **Blk14P** has the highest VIF at 4.323. However, this is not very high. When there are more serious multicollinearity problems, this value can be much higher. For example, with $R^2_k = 0.95$, the VIF would be 20, and with $R^2_k = 0.99$, it would be 100.\n",
+ "\n",
+ "In the example, the corresponding tolerance is 0.231. In other words, a regression of **Blk14P** on the other two variables would have an (unadjusted) $R^2$ of 0.769. This can be readily verified in a simple regression by specifying the respective subsets of the **dfs** data frame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e807cb2e-9759-4206-b9f6-0256b7a4cf0f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "regv = OLS(dfs[\"Blk14P\"],dfs[[\"Hisp14P\",\"HIS_ct\"]],nonspat_diag = False)\n",
+ "print(regv.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d18bdc0-ed6e-41de-b914-2e63a9a7828b",
+ "metadata": {},
+ "source": [
+ "The unadjusted $R^2$ in this regression is indeed 0.769."
+ ]
+ },
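+ {
+ "cell_type": "markdown",
+ "id": "e1f2a3b4",
+ "metadata": {},
+ "source": [
+ "As a quick arithmetic check, the VIF for **Blk14P** can be recovered from this auxiliary regression as the inverse of the tolerance, i.e., $1 / (1 - R^2_k)$. A minimal sketch, using the `r2` attribute of the auxiliary regression object:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f2a3b4c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# tolerance and VIF recovered from the auxiliary regression R2\n",
+ "tol = 1.0 - regv.r2\n",
+ "print('tolerance ', tol)\n",
+ "print('VIF ', 1.0 / tol)"
+ ]
+ },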
+ {
+ "cell_type": "markdown",
+ "id": "f16811ab-6c0a-4add-9da3-6726e0b02c17",
+ "metadata": {},
+ "source": [
+ "### F-Test on joint significance of coefficients"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1dc85fe3-cc5c-4790-bd46-bb08378e8318",
+ "metadata": {},
+ "source": [
+ "The F-test reported as part of the regression output listing is a joint test on all slope coefficients. \n",
+ "It is also possible to test the joint significance of a subset of coefficients. \n",
+ "In order to carry this out, the variables of interest must be the *last* variables. \n",
+ "In other words, the test is on the joint significance of the **last df** coefficients in **betas**, \n",
+ "where **df** is passed as the second argument to `spreg.f_stat` (because of the way the `import` statement was phrased, here available as just `f_stat`). The first argument is a regression object.\n",
+ "One can apply this to one or more variables.\n",
+ "\n",
+ "For example, to test the significance of **HIS_ct** in the last regression, the F-test would have `df=1`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b6cd61a-782a-43d0-8685-53a5103b9102",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "f_hisct = f_stat(ols2a,df=1)\n",
+ "print(f_hisct)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76d87358-0f47-47ed-a0fd-33459c0b811f",
+ "metadata": {},
+ "source": [
+ "The result is an F-statistic with 1, 787 degrees of freedom, 1 for the numerator (**df**) and 787 ($n - k$) for the denominator. The value of 65.796 is clearly significant, confirming the indication given by the t-test. Since the first degree of freedom of the F-statistic is 1, its value is exactly the square of the corresponding t-statistic in the regression, i.e., $8.11149^2 = 65.796$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0b52e61-fb4f-4695-ab6d-1733ed55ab1c",
+ "metadata": {},
+ "source": [
+ "To replicate the result of the F test that is listed in the regression output, two different approaches are valid. In one, the degrees of freedom is set to 3 (the last three coefficients), in the other it is not specified. The default of `f_stat` is to carry out an F test on all slope coefficients. The results for the two approaches are identical and match the regression output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c92d418d-39c2-4508-9307-d0507032e7cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "f_all1 = f_stat(ols2a,df=3)\n",
+ "print(f_all1)\n",
+ "f_all2 = f_stat(ols2a)\n",
+ "print(f_all2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c136550b-8451-4faf-9dd4-f21db8709fe6",
+ "metadata": {},
+ "source": [
+ "Finally, to test the joint significance of Hisp14P and His_ct, **df** is set to 2."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf373e8c-0e3c-42d4-850b-612470b3e122",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "f_two = f_stat(ols2a,df=2)\n",
+ "print(f_two)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0d8c1aa-67e7-4f51-bf67-3b50c0ba62e3",
+ "metadata": {},
+ "source": [
+ "### Wald test on joint significance of coefficients"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b05a5256-34e2-4695-b1d1-1f80a880ac6d",
+ "metadata": {},
+ "source": [
+ "The F-test as currently implemented is a bit awkward in that the joint test only applies to the last **df** coefficients. A more flexible approach is based on the textbook Wald test for linear restrictions on the regression coefficients (for example, as explained in Greene, 2012, p. 117-118). A set of linear constraints on the coefficients can be expressed as:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "R \\beta = q,\n",
+ "\\end{equation*}\n",
+ "\n",
+ "where $R$ is a $J \\times k$ matrix of constants, for $J$ constraints\n",
+ "and $k$ total parameters, $\\beta$ is the column vector with all the\n",
+ "model coefficients (i.e., including both the intercept and slopes), and\n",
+ "$q$ is a $J \\times 1$ column vector of zeros. In the case of a test on the joint significance of coefficients, the\n",
+ "matrix $R$ is a truncated identity matrix, with only the rows selected that pertain to the coefficients of interest.\n",
+ "This will be illustrated in more detail below.\n",
+ "\n",
+ "The test statistic takes the form of a general Wald statistic, or:\n",
+ "\\begin{equation*}\n",
+ "(R \\hat{\\beta})' [ R V R' ]^{-1} (R \\hat{\\beta}) \\sim \\chi^2(J),\n",
+ "\\end{equation*}\n",
+ "\n",
+ "i.e., following a chi-squared distribution with $J$ degrees of freedom, and\n",
+ "where $V$ is the estimated variance-covariance matrix for the coefficients, a $k \\times k$ matrix.\n",
+ "\n",
+ "The Wald test is available from `spreg.regimes.wald_test` (imported as just `wald_test` in this notebook). The arguments are `betas`, the vector with regression coefficients, `r`, the matrix $R$, `q`, the vector $q$, and `vm`, the estimated variance-covariance matrix for the regression. The result is a tuple with the test statistic and associated p-value."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c43d2368-0dfa-4dd5-a602-6323dbb5af94",
+ "metadata": {},
+ "source": [
+ "For example, taking the results from **ols2**, the regression coefficients would be in **ols2.betas** with associated variance-covariance matrix in **ols2.vm**. For a test on the joint significance of the coefficients of **Blk14P** and **HIS_ct**, `r` would be a $2 \\times 4$ matrix, with a 1 in position [0,1] and [1,3] (keep in mind that the count starts at 0). `q` would then be a $2 \\times 1$ column vector of zeros."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31de0d25-65bd-4e85-8dd2-00b9b936f7b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rr = np.identity(ols2.k)\n",
+ "r = rr[(1,3),:]\n",
+ "r"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03a3d28e-2768-432d-af6f-2316ca96a6bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "j = 2 # number of constraints\n",
+ "q = np.zeros((j,1))\n",
+ "q"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "728e157a-c30c-41d4-90dc-5e10c3dbe273",
+ "metadata": {},
+ "source": [
+ "The Wald statistic then follows as:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "76ee3ed1-43ad-4bef-9162-9ec4b6ab8516",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wald_test(ols2.betas,r,q,ols2.vm)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a81e06ea-3710-4acc-beae-aa597aa0321f",
+ "metadata": {},
+ "source": [
+ "The two coefficients are clearly jointly significant (one doesn't actually need a test to see that)."
+ ]
+ },
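+ {
+ "cell_type": "markdown",
+ "id": "b2c3d4e5",
+ "metadata": {},
+ "source": [
+ "For illustration only, the statistic can also be computed directly from the formula given above, using the coefficient vector and variance-covariance matrix of the regression object together with the **r** and **q** arrays just constructed (this sketch assumes *scipy* is available for the chi-squared p-value)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2c3d4e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: Wald statistic (R b)' [R V R']^{-1} (R b) computed by hand\n",
+ "from scipy.stats import chi2\n",
+ "rb = r @ ols2.betas - q\n",
+ "stat = (rb.T @ np.linalg.inv(r @ ols2.vm @ r.T) @ rb).item()\n",
+ "pval = chi2.sf(stat, r.shape[0])\n",
+ "print(stat, pval)"
+ ]
+ },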
+ {
+ "cell_type": "markdown",
+ "id": "3cb17e54-f829-4eac-9864-58f6090f9440",
+ "metadata": {},
+ "source": [
+ "A final example illustrates the equivalence of the Wald test, the t-test and the F-test when testing a single coefficient. For the last coefficient, of **HIS_ct**, `q` equals 0 and `r` is a row vector with all zeros except for the last element, which is 1."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d15ec24b-77a8-4fc4-a44f-29ad87ee569a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r = rr[3,:].reshape(1,-1)\n",
+ "q = 0\n",
+ "wald_test(ols2.betas,r,q,ols2.vm)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d911a27f-b360-4665-902a-538ab8229647",
+ "metadata": {},
+ "source": [
+ "The value of the statistic is the square of the t-statistic and the same as the F-test given above."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4b62234",
+ "metadata": {},
+ "source": [
+ "## Robust Standard Errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c29bcbbd",
+ "metadata": {},
+ "source": [
+ "The classical regression coefficient standard errors tend not to be very robust to deviations from the regularity conditions such as i.i.d. In cross-sectional data, it is rather unrealistic to assume the absence of heteroskedasticity. In fact, in the example used here, there is strong evidence of the presence of heteroskedasticity indicated by all three diagnostics.\n",
+ "\n",
+ "The so-called *White* standard errors (also known as Huber sandwich standard errors) correct for the presence of unspecified heteroskedasticity. They are almost always (but not always) larger than the classical standard errors, leading to a bit more conservative inference.\n",
+ "\n",
+ "A more recent development are the so-called *HAC* standard errors introduced by Kelejian and Prucha (2007)(\"HAC estimation in a spatial framework,\" *Journal of Econometrics* 140, 131-154), where HAC stands for heteroskedastic and autocorrelation consistent standard errors. The computations use a kernel logic to obtain standard errors that correct for *both* heteroskedasticity and the possible presence of spatial autocorrelation of unspecified form. Typically (but again not always) these are the largest standard errors.\n",
+ "\n",
+ "The robust standard errors are invoked with the option `robust` in `spreg.OLS`. The default is `None`, for classical standard errors."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "966bd8e9",
+ "metadata": {},
+ "source": [
+ "### White Standard Errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e6a028cc",
+ "metadata": {},
+ "source": [
+ "White standard errors are obtained by setting `robust='white'`. Except for the standard errors, the regression output remains exactly the same."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eecf3af1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols3 = OLS(y,x2,robust='white',\n",
+ " name_ds=ds_name)\n",
+ "print(ols3.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6dd1ca67",
+ "metadata": {},
+ "source": [
+ "In the example, the standard errors increase for all coefficients except for Blk14P (3.39 vs 3.49). However, the impact on inference is negligible, with all coefficients still significant at p < 0.003."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "99555c1d",
+ "metadata": {},
+ "source": [
+ "### HAC Standard Errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "382e64ee",
+ "metadata": {},
+ "source": [
+ "HAC standard errors are obtained with `robust='hac'`. In addition, a kernel weights object must be passed (`gwk`), and optionally, its name (`name_gwk`). Again, except for the standard errors, the output is the same as before. One slight difference is that no diagnostics for spatial dependence are implemented when the HAC option is used. In the example, the triangular kernel weights contained in **wk** are used. Several other kernel functions (and different bandwidths) can be used to create kernel weights through *libpysal.weights*. In practice, some experimentation is advised."
+ ]
+ },
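+ {
+ "cell_type": "markdown",
+ "id": "c3d4e5f6",
+ "metadata": {},
+ "source": [
+ "For reference, a minimal sketch of how adaptive-bandwidth triangular kernel weights could be built directly with `libpysal.weights.Kernel` is given below (the name **wk_tri** is just illustrative, and **dfs** is assumed to be the geodataframe read from the shape file). The regression that follows continues to use the kernel weights **wk** from earlier in the notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c3d4e5f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: adaptive triangular kernel weights based on 10 nearest neighbors\n",
+ "import libpysal.weights as weights\n",
+ "coords = list(zip(dfs.geometry.centroid.x, dfs.geometry.centroid.y))\n",
+ "wk_tri = weights.Kernel(coords, k=10, function='triangular', fixed=False)\n",
+ "print(wk_tri.n)"
+ ]
+ },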
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45384072",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols4 = OLS(y,x2,robust='hac',\n",
+ " gwk=wk,name_gwk=gwk_name,\n",
+ " name_ds=ds_name)\n",
+ "print(ols4.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "897615b1",
+ "metadata": {},
+ "source": [
+ "In the example, the HAC standard errors are slightly larger than in the White case, but again not sufficient to affect inference."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "At this point, it would be useful to set up your own baseline regression model, with a continuous dependent variable and at least two or three explanatory variables. You can pick any set of variables from the Chicago data set, from one of the PySAL sample data sets or your own data, but of course, make sure that your research question makes sense. Create some kernel weights (use `libpysal.weights`) to check out the HAC estimation.\n",
+ "\n",
+ "Assess the extent to which your initial specification may suffer from some forms of misspecification, as well as the sensitivity of your results to different measures of standard errors. In addition, test some linear restrictions on the coefficients.\n",
+ "\n",
+ "You may also want to experiment with the plotting and mapping functionality contained in *geopandas* to create plots and maps of the predicted values and/or residuals.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/6_TWOSLS.ipynb.txt b/_sources/notebooks/6_TWOSLS.ipynb.txt
new file mode 100644
index 00000000..e8e15436
--- /dev/null
+++ b/_sources/notebooks/6_TWOSLS.ipynb.txt
@@ -0,0 +1,711 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Two Stage Least Squares Regression (2SLS)\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/08/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, endogenous and instrumental variables are introduced and the basics of two stage least squares estimation are presented.\n",
+ "\n",
+ "Technical details are given in Chapter 6 in Anselin and Rey (2014). *Modern Spatial Econometrics in Practice*."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a443690f",
+ "metadata": {},
+ "source": [
+ "### Prerequisites\n",
+ "\n",
+ "Familiarity with *pandas*, *geopandas* and *libpysal* is assumed to read data files and manipulate spatial weights as well as knowledge of how to use *PySAL* sample data sets. Again, the **chicagoSDOH** sample data set is used. If not available, it must be installed first with `libpysal.examples.load_example(\"chicaogoSDOH\")`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main module for spatial regression in PySAL is *spreg*. In addition, *libpysal* is needed to handle the example data sets and spatial weights, and *pandas* and *geopandas* for data input and output. This notebook is based on version 1.7 of *spreg*. \n",
+ "\n",
+ "Some additional imports are included to avoid excessive warning messages. With later versions of PySAL, these may not be needed. Finally, to avoid issues with the `print` function for *numpy* 2.0 and later, the `legacy` option is set in `set_printoptions`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "import libpysal.weights as weights\n",
+ "from spreg import OLS, TSLS, f_stat\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functionality Used\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - read_file\n",
+ " - DataFrame\n",
+ " - concat\n",
+ " \n",
+ "- from libpysal:\n",
+ " - examples.get_path\n",
+ " - io.open\n",
+ " - weights.distance.Kernel\n",
+ " \n",
+ "- from spreg:\n",
+ " - OLS\n",
+ " - TSLS\n",
+ " - f_stat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9b0c168",
+ "metadata": {},
+ "source": [
+ "### Input Files"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "74ab0075",
+ "metadata": {},
+ "source": [
+ "All notebooks are organized such that the relevant filenames and variables names are listed at the top, so that they can be easily adjusted for use with your own data sets and variables.\n",
+ "\n",
+ "In the current notebook, the **Chi-SDOH** sample shape file is used, with associated kernel weights (for HAC standard errors). The specific file names are:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic determinants of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_k10tri.kwt**: triangular kernel weights based on a variable bandwidth with 10 nearest neighbors from *GeoDa*\n",
+ "\n",
+ "As before, the input files are specified generically as **infileshp** (for the shape file) and **infilek** (for the kernel weights). The `libpysal.examples.get_path` functionality is used to get the correct path."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "4d4335bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infilek = get_path(\"Chi-SDOH_k10tri.kwt\") # triangular kernel weights from GeoDa"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6613bdc",
+ "metadata": {},
+ "source": [
+ "### Variable Names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "06db06ed",
+ "metadata": {},
+ "source": [
+ "The illustration in this notebook considers two different specifications. One has **YPLL_rate** (index of premature mortality) as the dependent variable and **Blk14P** (percent Black population), **Hisp14P** (percent Hispanic population), and **HIS_ct** (economic hardship index) as explanatory variables. The variable **HIS_ct** is considered to be *endogenous*, with the census tract centroids as instruments (**COORD_X** and **COORD_Y**). The full set of regressors is specified in **z_names1**, the exogenous variables in **xe_names**, endogenous variable in **yend_names1**, and instruments in **q_names1**.\n",
+ "\n",
+ "A second specification uses the same dependent variable (**y_name**) and exogenous regressors (**xe_names**), but now includes two endogenous regressors, **HIS_ct** and **EP_NOHDSP** (percent without a high school diploma) in **yend_names2** (with the full set of regressors in **z_names2**), and one additional instrument (in addition to the census tract centroids), **EP_LIMENG** (percent with limited English proficiency), specified in **q_names2**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "69476f7b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_name = ['YPLL_rate']\n",
+ "z_names1 = ['Blk14P','Hisp14P','HIS_ct']\n",
+ "z_names2 = ['Blk14P','Hisp14P','EP_NOHSDP','HIS_ct']\n",
+ "xe_names = ['Blk14P','Hisp14P']\n",
+ "yend_names1 = ['HIS_ct']\n",
+ "yend_names2 = ['EP_NOHSDP','HIS_ct']\n",
+ "q_names1 = ['COORD_X', 'COORD_Y']\n",
+ "q_names2 = ['COORD_X', 'COORD_Y','EP_LIMENG']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "gwk_name = 'Chi-SDOH_k10tri'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35566ba3",
+ "metadata": {},
+ "source": [
+ "### Variable definition and data input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "73afed82",
+ "metadata": {},
+ "source": [
+ "The input geo data frame is created using `read_file` and the weights file using `libpysal.io` (imported as `open`). Also, the class of the kernel weights is corrected using `libpysal.weights.distance.Kernel` (`libpysal.weights` is imported as `weights`). Next, all the relevant variables are initialized as subsets from the data frame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8d93a92",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wk = open(infilek).read()\n",
+ "print(wk.n)\n",
+ "wk.__class__ = weights.distance.Kernel\n",
+ "print(type(wk))\n",
+ "\n",
+ "y = dfs[y_name]\n",
+ "z1 = dfs[z_names1]\n",
+ "z2 = dfs[z_names2]\n",
+ "xe = dfs[xe_names]\n",
+ "yend1 = dfs[yend_names1]\n",
+ "yend2 = dfs[yend_names2]\n",
+ "q1 = dfs[q_names1]\n",
+ "q2 = dfs[q_names2]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0762e5db-f024-412d-9691-d1e3916a446d",
+ "metadata": {},
+ "source": [
+ "## The Principle of Two Stage Least Squares"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d294d0d1-da43-44ee-a107-71258882eee7",
+ "metadata": {},
+ "source": [
+ "A fundamental assumption behind OLS is, loosely put, that the explanatory variables ($X$) are uncorrelated with the error terms. If this is not the case for one or more variables, the OLS estimates will be *biased* (in the early days, this used to be called simultaneous equation bias, now it is referred to as endogeneity). To correct for this bias, the violating *endogenous* variables are replaced by *instruments* that are: (1) uncorrelated with the error term; and (2) closely related (but not too close) to the original endogenous variables.\n",
+ "\n",
+ "For example, consider a set of explanatory variables, such that:\n",
+ "\\begin{equation*}\n",
+ "y = Z\\delta + u,\n",
+ "\\end{equation*}\n",
+ "where $Z$ is organized such that the exogenous variables $X$ are first and the endogenous ones $Y$ are second, as $Z = [X \\ Y]$.\n",
+ "\n",
+ "In essence, 2SLS estimation can be thought of as proceeding\n",
+ "in two stages (hence the acronym). In the first stage, each of the endogenous variables\n",
+ "is regressed on a matrix of instruments, $Q$, which includes all of the exogenous\n",
+ "variables in $X$ as well as specific *instruments*. The predicted values from this regression are then used as\n",
+ "explanatory variables, replacing the endogenous variables in the *second* stage, which yields consistent estimates\n",
+ "for the coefficients $\\delta$. Note that the predicted value in a regression of the $X$ variables on $Q$\n",
+ "(which includes the $X$) simply yields the original $X$, so that \n",
+ "there are in fact no new instrumental variables for the $X$. \n",
+ "\n",
+ "The first stage can be expressed succinctly as:\n",
+ "\\begin{equation*}\n",
+ "\\hat{Z} = Q [ (Q'Q)^{-1} Q'Z ],\n",
+ "\\end{equation*}\n",
+ "where the term in square brackets is the vector of OLS regression coefficients in a regression\n",
+ "of each of the $Z$ on the instrument matrix $Q$.\n",
+ "\n",
+ "The second stage consists of an OLS regression of $y$ on the predicted values\n",
+ "$\\hat{Z}$:\n",
+ "\\begin{equation*}\n",
+ "\\hat{\\delta} = (\\hat{Z'} \\hat{Z} )^{-1} \\hat{Z'} y.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "Alternatively, substituting the results of the first regression yields\n",
+ "the full expression as:\n",
+ "\\begin{equation*}\n",
+ "\\hat{\\delta} = [ Z'Q (Q'Q)^{-1} Q'Z ]^{-1} Z'Q (Q'Q)^{-1} Q' y.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "Strictly speaking, the 2SLS estimator is an instrumental variables estimator with instruments:\n",
+ "\\begin{equation*}\n",
+ "H = Q (Q'Q)^{-1} Q'Z,\n",
+ "\\end{equation*}\n",
+ "where $Q (Q'Q)^{-1} Q'$ is often referred to as a projection matrix. The estimator can\n",
+ "be expressed in the usual way as:\n",
+ "\\begin{equation*}\n",
+ "\\hat{\\delta} = (H'Z)^{-1} H'y,\n",
+ "\\end{equation*}\n",
+ "which, after substituting the full expression for $H$ and some matrix algebra yields \n",
+ "the same result as above.\n"
+ ]
+ },
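+ {
+ "cell_type": "markdown",
+ "id": "d4e5f6a7",
+ "metadata": {},
+ "source": [
+ "Before spelling out the two stages explicitly, a minimal *numpy* sketch of the closed-form expression above may be helpful (purely illustrative; the matrices are built from the variables created earlier, with a constant column added by hand). The resulting coefficients should match those reported by `spreg.TSLS` further below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4e5f6a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: closed-form 2SLS estimator delta = [Z'Q(Q'Q)^{-1}Q'Z]^{-1} Z'Q(Q'Q)^{-1}Q'y\n",
+ "Zm = np.hstack((np.ones((len(y), 1)), xe.values, yend1.values))  # Z = [1 X Y]\n",
+ "Qm = np.hstack((np.ones((len(y), 1)), xe.values, q1.values))     # Q = [1 X instruments]\n",
+ "ym = y.values.reshape(-1, 1)\n",
+ "QtQi = np.linalg.inv(Qm.T @ Qm)\n",
+ "ZtQ = Zm.T @ Qm\n",
+ "delta_hat = np.linalg.solve(ZtQ @ QtQi @ ZtQ.T, ZtQ @ QtQi @ Qm.T @ ym)\n",
+ "print(delta_hat)"
+ ]
+ },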
+ {
+ "cell_type": "markdown",
+ "id": "d2b3f701-05ff-4064-859e-f3aa2c35554e",
+ "metadata": {},
+ "source": [
+ "## The Two Stages of 2SLS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "95ed9650-f8fa-477f-baa0-1ecf4a1e411a",
+ "metadata": {},
+ "source": [
+ "Before illustrating the `spreg.TSLS` functionality, the two stages of 2SLS are spelled out in detail, using the first regression example. First, as a reference point, the OLS estimation, using `spreg.OLS` with `nonspat_diag = False` to limit the output. The complete set of regressors is contained in **z1**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce2419e7-887b-40f2-ae7d-7129e6cc074a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x=z1,nonspat_diag = False,name_ds = ds_name)\n",
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d14f8459-dc04-48b1-befb-201089605ace",
+ "metadata": {},
+ "source": [
+ "### Creating the instruments"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c815b43-bf56-4575-8b5d-00e5bd179673",
+ "metadata": {},
+ "source": [
+ "In the example, there is only one endogenous variable, **HIS_ct**. The associated *instrumental variable* is the predicted value from a regression of **HIS_ct** on the exogenous variables **xe** and the instruments **q1**. The terminology can get a bit confusing, since instrumental variables and instruments are often used interchangeably. In a strict sense, the predicted value is an instrumental variable for the endogenous variable and the exogenous variables are often not explicitly designated as instruments. This is the meaning used here. So, the full set of instruments consists of *both* **xe** and **q1**. This is accomplished by means of *pandas* `concat` function.\n",
+ "\n",
+ "The instrumental variable for **HIS_ct** is then extracted as the `predy` attribute of the regression object and turned into a dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "59b0cb8e-1285-4d31-b6c3-70f09cd8cb1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "qq = pd.concat([xe,q1],axis=1)\n",
+ "olsinst = OLS(yend1,qq,nonspat_diag=False,name_ds=ds_name)\n",
+ "print(olsinst.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e9501f8-69d9-4929-99af-6ed5dafe2a0c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "yend_p = olsinst.predy\n",
+ "yep = pd.DataFrame(yend_p,columns=['HIS_ct_p'])\n",
+ "yep"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "173a37ac-8579-4188-a99c-ae8aa770d986",
+ "metadata": {},
+ "source": [
+ "### Second stage"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bea8e71-26fd-4e1b-a7ed-98f3e0de1275",
+ "metadata": {},
+ "source": [
+ "The second stage consists of an OLS estimation of **YPLL_rate** on the three regressors, where **HIS_ct** is replaced by its predicted value. Again, first `concat` is used to put the exogenous variables together with the newly created predicted value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "730c0601-1bd8-4768-9035-17c666225d44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "zz = pd.concat([xe,yep],axis=1)\n",
+ "ols2 = OLS(y,zz,nonspat_diag = False, name_ds=ds_name)\n",
+ "print(ols2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4e5ff992-0404-4824-94a6-f6f8408e3df1",
+ "metadata": {},
+ "source": [
+ "Compared to the results of the original OLS regression (**ols1**), there are some marked differences in both the magnitude and significance of the coefficients. The coefficient for **Blk14P** is less than half its previous value (16.4 vs. 42.1) and is no longer significant. The coefficient for **Hisp14P** is three times as large (-45.7 vs. -14.6) and **HIS_ct_p** double (153.6 vs. 72.7). Both remain highly significant. However, as noted below, the standard errors and thus also the significance must be interpreted with caution."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2129c6c0-d1cd-438a-88ef-bdd2601230ed",
+ "metadata": {},
+ "source": [
+ "### Predicted values and residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "426c68e2-3ee9-457f-96fd-caab85547425",
+ "metadata": {},
+ "source": [
+ "The proper residuals for the 2SLS regression should be $y - Z\\hat{\\delta}$, and *not* $y - \\hat{Z}\\hat{\\delta}$, as given by the second stage OLS regression. As a consequence, the standard errors (which are based on those residuals) given in the second stage OLS regression are *not* the proper standard errors in a 2SLS estimation. In some software implementations, this is sometimes overlooked when the estimation is implemented as two actual separate stages. The correct standard errors and associated (asymptotic) t-statistics and p-values are given by the `spreg.TSLS` command."
+ ]
+ },
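+ {
+ "cell_type": "markdown",
+ "id": "e5f6a7b8",
+ "metadata": {},
+ "source": [
+ "A small sketch of this point, using the second stage coefficients from **ols2** but the *actual* values of **HIS_ct** in $Z$ (an illustration only, with the constant column added by hand):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5f6a7b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: proper 2SLS residuals y - Z delta_hat (not y - Z_hat delta_hat)\n",
+ "zfull = np.hstack((np.ones((len(y), 1)), xe.values, yend1.values))\n",
+ "resid_2sls = y.values.reshape(-1, 1) - zfull @ ols2.betas\n",
+ "print(resid_2sls[:5])"
+ ]
+ },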
+ {
+ "cell_type": "markdown",
+ "id": "ef05242c-8a10-4dde-a371-93e90b7685c2",
+ "metadata": {},
+ "source": [
+ "## TSLS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4105dc9-115d-4092-99cd-d0d77da89c46",
+ "metadata": {},
+ "source": [
+ "### Immigrant paradox"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "22995b21-c8ed-4b57-849b-a9526e85c29a",
+ "metadata": {},
+ "source": [
+ "The proper standard errors for 2SLS are obtained with `spreg.TSLS`. The required arguments are `y` for the dependent variable, `x` for the exogenous variables, `yend` for the endogenous variables and `q` for the instruments. The default is to have `nonspat_diag = True`, so this is turned to `False` to focus on just the estimates and their significance. Finally, `name_ds` is included for the data set name."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61a5e9a8-c086-4d74-9911-3c94bc1a2dff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls1 = TSLS(y,x=xe,yend=yend1,q=q1,nonspat_diag = False,name_ds=ds_name)\n",
+ "print(tsls1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eb56a0c1-9825-47e2-945b-e52391c22d7d",
+ "metadata": {},
+ "source": [
+ "The coefficient estimates are identical to the ones obtained in **ols2**, but the standard errors are slightly different. In the end, this does not affect the significance in a meaningful way. **Blk14P** remains not significant and the other p-values are only marginally affected.\n",
+ "\n",
+ "The complete set of attributes of the regression object is given with the usual `dir` command. They are essentially the same as for an OLS object, e.g., with the estimates in `betas`, predicted values in `predy` and residuals in `u`. In addition, several intermediate matrix results are included as well that are useful for customized calculations, but beyond the current scope."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1edd29b5-89db-4076-ba59-98c90f1977e1",
+ "metadata": {},
+ "source": [
+ "### Predicted values and residuals"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c514a80c-6286-4db6-b724-d32fb11e133f",
+ "metadata": {},
+ "source": [
+ "As mentioned, the residuals are $y - \\hat{y}$, with $\\hat{y}$ as the predicted values.\n",
+ "The latter are obtained as $Z \\hat{\\delta}$. This is somewhat misleading as a \n",
+ "measure of fit, since observations on the endogenous variables are included in the matrix\n",
+ "$Z$. A proper predicted value would be obtained from a solution of the reduced\n",
+ "form, with only exogenous variables on the RHS of the equation. However, in a single equation\n",
+ "cross-sectional setting, this is not possible. As computed, the predicted value thus may give an\n",
+ "overly optimistic measure of the fit of the model. Consequently, the associated residual sum of squares may be too small to reflect the correct performance of the model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ebde4bba-1085-41f1-8e33-eb5d95fd817f",
+ "metadata": {},
+ "source": [
+ "### Two endogenous variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5f3fcf58-a2f6-488d-a02b-331d8f533dbd",
+ "metadata": {},
+ "source": [
+ "To further illustrate that 2SLS also works for multiple endogenous variables, the second specification is used, now with **yend2** as the endogenous variables and **q2** as the associated instruments. For reference, the OLS results are given as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97ae9e6e-f34a-4884-9950-73f19177a9d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols3 = OLS(y,x=z2,nonspat_diag=False,name_ds=ds_name)\n",
+ "print(ols3.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6ade228c-3c80-40ca-8355-7982773f3d3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls2 = TSLS(y,x=xe,yend=yend2,q=q2,nonspat_diag=False,name_ds=ds_name)\n",
+ "print(tsls2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19bac111-50d6-4b0d-a5d6-bec6bed028a3",
+ "metadata": {},
+ "source": [
+ "The effect of correcting for potential endogeneity is again significant. In the OLS results, all coefficients except **EP_NOHSDP** are highly significant. However, in the 2SLS results, neither **Blk14P** nor **Hisp14P** are significant, but **EP_NOHSDP** becomes significant. Its coefficient also changes sign, from positive (but not significant, so essentially zero) to negative. This illustrates the importance of checking for potential endogeneity."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "32e0deb4-bc0c-4868-9600-944db1e51482",
+ "metadata": {},
+ "source": [
+ "## Durbin-Wu-Hausman Test"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a882499-ceca-4413-8028-d7433eeceade",
+ "metadata": {},
+ "source": [
+ "The Durbin-Wu-Hausman statistic (DWH) is a test for the endogeneity of some regressors, provided instruments are available. It is a simple F-test on the joint significance of selected coefficients in an augmented regression. The augmented regression specification consists of the original exogenous variables and predicted values of the endogenous variables when regressed on the instruments (those include the exogenous variables). If the null hypothesis holds, the coefficients of these predicted values should not be significant. The DWH test then boils down to an F-test on their joint significance (see Davidson and MacKinnon 2004, p. 338-340).\n",
+ "\n",
+ "The test is included as a default diagnostic in `spreg.TSLS` (the default is `nonspat_diag=True` so this must not be included explicitly). For the two example regressions, this yields the following results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75de9b19-a557-4d04-bb59-4b3b1780868e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls1a = TSLS(y,x=xe,yend=yend1,q=q1,name_ds=ds_name)\n",
+ "print(tsls1a.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15e7ce06-780e-483b-bb29-05a1fc3280b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls2a = TSLS(y,x=xe,yend=yend2,q=q2,name_ds=ds_name)\n",
+ "print(tsls2a.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80984aee-322b-495f-afd9-02fdae4de98c",
+ "metadata": {},
+ "source": [
+ "In both instances, there is clear evidence that controlling for endogeneity was warranted.\n",
+ "\n",
+ "Since the predicted value for **HIS_ct** in the first regression is already available as **yep** above, the DWH test can be verified by carrying out the auxiliary regression. To this end **z1** must be concatenated with **yep** to form the new matrix of regressors. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f9098f0-f23e-4c72-b6a3-8450e67a45f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "zza = pd.concat([z1,yep],axis=1)\n",
+ "ols4 = OLS(y,zza,nonspat_diag=False,name_ds=ds_name)\n",
+ "print(ols4.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b0f6ff3f-3b77-479c-b535-3648a2a77447",
+ "metadata": {},
+ "source": [
+ "The DWH statistic then follows as the result of an F-test on the coefficient of **HIS_ct_p** in the auxiliary regression."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37c27f55-c389-4dc3-8fa1-dbb180eda9a9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dwhf = f_stat(ols4,df=1)\n",
+ "print(dwhf)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "491f98c2-30b9-4e56-a8c9-0eef8a855fae",
+ "metadata": {},
+ "source": [
+ "The resulting F-statistic is exactly the value of DWH listed for **tsls1a**, with the associated p-value. In the same way, the result for the second 2SLS regression can be verified, but this is slightly more involved. In the second case, two additional regression are required to obtain predicted values for each of the endogenous variables, but otherwise everything is the same. This is all carried out under the hood for the DWH statistic."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd5699e6-1a1a-41ae-a437-aaf524b245e4",
+ "metadata": {},
+ "source": [
+ "## Robust Standard Errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16a9c979-d3b5-488a-b242-4b52014077a0",
+ "metadata": {},
+ "source": [
+ "As was the case for OLS regression, 2SLS allows for robust standard errors by setting the arguments `robust=\"white\"` or `robust=\"hac\"`. Everything operates in the same way as for OLS, i.e., the estimates are identical, but the standard errors, z-statistic and p-values may differ."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "420a53ae-e121-4496-a4fb-cbe9981e07ef",
+ "metadata": {},
+ "source": [
+ "### White standard errors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04000053-cee4-4ada-befe-779bf28e2dff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls1b = TSLS(y,x=xe,yend=yend1,q=q1,name_ds=ds_name,\n",
+ " robust='white')\n",
+ "print(tsls1b.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c509296-0547-4dbc-9ef5-40531ae36e52",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls2b = TSLS(y,x=xe,yend=yend2,q=q2,name_ds=ds_name,\n",
+ " robust='white')\n",
+ "print(tsls2b.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78e7eed4-ee84-40de-9a40-40900cbb65ce",
+ "metadata": {},
+ "source": [
+ "### HAC standard errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a695fbf-0877-4d90-b969-05081a75df8d",
+ "metadata": {},
+ "source": [
+ "As was the case for OLS, in addition to `robust=\"hac\"`, the kernel weights (`gwk`) and optionally their name (`name_gwk`) must be specified as well."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd58790d-2b7f-466e-9e10-91951da1eda0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls1c = TSLS(y,x=xe,yend=yend1,q=q1,name_ds=ds_name,\n",
+ " robust='hac',gwk=wk,name_gwk=gwk_name)\n",
+ "print(tsls1c.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "438ae0cc-de8d-4051-981e-c4307f82cbd8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls2c = TSLS(y,x=xe,yend=yend2,q=q2,name_ds=ds_name,\n",
+ " robust='hac',gwk=wk,name_gwk=gwk_name)\n",
+ "print(tsls2c.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "At this point, you can assess endogeneity in your own regression specification, or experiment with different models constructed from the variables in the **chicagoSDOH** sample data set. Replicate the 2SLS results by explicitly carrying out the two stage OLS regressions and/or check on the value of the Durbin-Wu-Hausman test by means of the auxiliary regression."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/7_spatial_models.ipynb.txt b/_sources/notebooks/7_spatial_models.ipynb.txt
new file mode 100644
index 00000000..b35d6887
--- /dev/null
+++ b/_sources/notebooks/7_spatial_models.ipynb.txt
@@ -0,0 +1,822 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Spatial Model Specifications\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/07/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, the basic model specifications that include spatial dependence into a linear spatial regression are introduced. This is implemented through the use of a spatially lagged variable. The spatial lag is applied to the dependent variable, as $Wy$, to the explanatory variables (excluding the constant term), as $WX$, and to the error terms, as $We$. For technical details, see the relevant chapters in Anselin and Rey (2014). *Modern Spatial Econometrics in Practice*.\n",
+ "\n",
+ "As an illustration, the effect of a spatial misspecification will be investigated. Specifically, the impact spatial effects have on the classical OLS estimates will be assessed when they are present in the data generating process (DGP), but ignored in the regression specification. To accomplish this, some simple simulation experiments are carried out.\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+ "Familiarity with OLS estimation in *spreg* is assumed, as covered in the *OLS notebook*. For the graphs, it may be useful to have some familiarity with *matplotlib*, although to just replicate the graphs used here, that is not really needed."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6494b68c",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main module for spatial regression in PySAL is *spreg*. In addition, *libpysal* is needed for spatial weights manipulation, and *pandas* for data frame manipulation. In the current illustrations, *geopandas* is not needed.\n",
+ "\n",
+ "In addition, in order to plot the results of the simulation experiments, *seaborn* will be used, which in turns relies on *matplotlib* as a dependency. While under the hood, everything is actually *matplotlib* code, *seaborn* is a bit more intuitive and easier to achieve simple results. An in-depth coverage of the *seaborn* functionality is beyond the current scope, but the illustrations given here should be enough to get some decent graphs. For full details, see https://seaborn.pydata.org.\n",
+ "\n",
+ "Finally, the module *time* is imported to provide some timing results (optional)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import time\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from libpysal import weights\n",
+ "from spreg import OLS, make_x, make_xb, make_wx, make_wxg, make_error, \\\n",
+ " dgp_lag, dgp_sperror, dgp_slx, dgp_spdurbin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functions Used\n",
+ "\n",
+ "- from numpy:\n",
+ " - random.default_rng\n",
+ " - zeros\n",
+ "\n",
+ "- from pandas:\n",
+ " - DataFrame\n",
+ " - describe\n",
+ " - melt\n",
+ " \n",
+ "- from seaborn:\n",
+ " - displot\n",
+ "\n",
+ "- from matplotlib.pyplot:\n",
+ " - show\n",
+ " \n",
+ "- from libpysal:\n",
+ " - weights.lat2W\n",
+ " - w.transform\n",
+ " - w.n\n",
+ " \n",
+ "- from spreg:\n",
+ " - OLS\n",
+ " - make_x\n",
+ " - make_xb\n",
+ " - make_wx\n",
+ " - make_wxg\n",
+ " - make_error\n",
+ " - dgp_lag\n",
+ " - dgp_sperror\n",
+ " - dgp_slx\n",
+ " - dgp_spdurbin\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "67da216d",
+ "metadata": {},
+ "source": [
+ "### Files and Variables\n",
+ "\n",
+ "In this notebook, no actual data are used, but data sets will be created by means of simulation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0f82891",
+ "metadata": {},
+ "source": [
+ "### Model Parameters\n",
+ "\n",
+ "The various model parameters are set here, so that it is easy to replicate the experiments for different sample sizes and coefficient values.\n",
+ "\n",
+ "- gridx: the number of cells in the horizontal dimension of a regular lattice of dimension gridx x gridy -- set to 20\n",
+ "- gridy: the number of cells in the vertical dimension of a regular lattice of dimension gridx x gridy -- set to 20\n",
+ "- b1: a list with regression parameters (includes a coefficient for the constant term as the first element) -- set to 1, 1\n",
+ "- rndseed: the random seed to ensure reproducibility -- set to 123456789\n",
+ "- reps: the number of replications -- set to 1000\n",
+ "- rhovals: a list with spatial autoregressive coefficients $\\rho$ for the lag variables Wy -- set to [0, 0.2, 0.5, 0.7, 0.9]\n",
+ "- lamvals: a list with spatial coefficients $\\lambda$ for the error lag variables We -- set to [0, 0.2, 0.5, 0.7, 0.9]\n",
+ "- gamvals: a list with coefficients for the SLX variable (WX) -- set to [0.1, 0.3, 0.5, 0.7, 0,9]\n",
+ "- gamma: coefficient for WX in the Spatial Durbin Model - set to 0.5\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "d1fa8d01",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gridx = 20\n",
+ "gridy = 20\n",
+ "b1 = [1, 1]\n",
+ "rndseed = 123456789\n",
+ "reps = 1000\n",
+ "rhovals = [0, 0.2, 0.5, 0.7, 0.9]\n",
+ "lamvals = [0, 0.2, 0.5, 0.7, 0.9]\n",
+ "gamvals = [0.1, 0.3, 0.5, 0.7, 0.9]\n",
+ "gamma = 0.5"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f89d7018",
+ "metadata": {},
+ "source": [
+ "### Spatial Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b6b5e514",
+ "metadata": {},
+ "source": [
+ "Row-standardized queen contiguity spatial weights are constructed for a regular lattice. The number of observations is the product of row and column dimensions of the grid. The *libpysal* command `weights.lat2W` with `rook=False` is used to obtain queen weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bfec5df5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "w = weights.lat2W(gridx,gridy,rook=False) \n",
+ "w.transform = 'r'\n",
+ "n = w.n\n",
+ "n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16237db8",
+ "metadata": {},
+ "source": [
+ "### X Matrices\n",
+ "\n",
+ "The X matrix is constructed using the `make_X` command from the `dgp` module in `spreg`, with the default uniform distribution. Next, $X\\beta$ is computed with `make_xb` and $WX$ with `make_wx`. This uses the X matrix (without a constant column), the spatial weights and the provided regression parameters as inputs. The resulting matrices will be used as input to create the dependent vector $y$ for specific data generating processes (DGP). For these DGP, the regression coefficients and random seed specified on top are used.\n",
+ "\n",
+ "Note that in this example `make_x` will make a one column vector, since there is only one slope coefficient. In `make_xb`, a constant column is added automatically. So, whereas a constant column is not included in **x1**, the result of `make_x`, its coefficient **must** be included in the coefficient list **b1** to compute $X \\beta$ correctly. The result is a column vector."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "4111b0a9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create X\n",
+ "rng=np.random.default_rng(seed=rndseed) # set seed for X\n",
+ "x1 = make_x(rng,n,mu=[0],varu=[6],method=\"uniform\")\n",
+ "xb1 = make_xb(x1,b1) # no constant in x1, but a coefficient for the constant in b1\n",
+ "wx1 = make_wx(x1,w) # default first order no constant"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfb212ba",
+ "metadata": {},
+ "source": [
+ "## Spatial Lag Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "05175efb",
+ "metadata": {},
+ "source": [
+ "The spatial lag model takes on the form:\n",
+ "\\begin{equation*}\n",
+ "y = \\rho Wy + X\\beta + u,\n",
+ "\\end{equation*}\n",
+ "where $Wy$ is the spatial lag term, $\\rho$ is the spatial autoregressive coefficient, $\\beta$ are the regression coefficients, and $u$ is an error vector."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "857a3138",
+ "metadata": {},
+ "source": [
+ "The dependent variable for a spatial lag model is generated by means of `spreg.dgp_lag`. This uses the reduced form for the spatial lag model:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "y = (I - \\rho W)^{-1} X\\beta + (I - \\rho W)^{-1}u.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "Interest centers on finding out what happens to the estimated coefficient $\\hat{\\beta}$ when the true DGP is the spatial lag model, but the estimation ignores the spatial lag term and uses OLS. In other words, $\\beta$ is estimated in the regression $y = X\\beta + u$. The properties of the OLS estimates in this misspecified regression are investigated by means of a small simulation.\n",
+ "\n",
+ "The simulation consists of the following steps:\n",
+ "\n",
+ "- initialize the result matrix for the estimates of $\\beta$\n",
+ "\n",
+ "- in each step of the replications, generate $y$ from the DGP and obtain $\\hat{\\beta}$ from an OLS estimation\n",
+ "\n",
+ "- run the simulation loop over all replications and spatial parameters and collect the results\n",
+ "\n",
+ "- turn the results matrix into a DataFrame\n",
+ "\n",
+ "- compute descriptive statistics\n",
+ "\n",
+ "- plot the distribution of parameter estimates for different values of the spatial parameter\n",
+ "\n",
+ "For greatest efficiency, these steps could all be combined in one big function, but to get a good sense of what is going on in each step, for now they are kept separate."
+ ]
+ },
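+ {
+ "cell_type": "markdown",
+ "id": "f6a7b8c9",
+ "metadata": {},
+ "source": [
+ "As a purely illustrative aside, the reduced form given above can be evaluated directly with *numpy* for a single value of $\\rho$, using a dense version of the weights matrix (the names **Wf** and **y_rf** are just for this sketch). This mirrors what `spreg.dgp_lag` produces for each replication in the simulation below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6a7b8d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: y from the reduced form (I - rho W)^{-1} (X beta + u) for rho = 0.5\n",
+ "rng = np.random.default_rng(seed=rndseed)\n",
+ "u = make_error(rng, n)\n",
+ "Wf = w.full()[0]                       # dense n x n weights matrix\n",
+ "y_rf = np.linalg.solve(np.eye(n) - 0.5 * Wf, xb1 + u)\n",
+ "y_rf[:5]"
+ ]
+ },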
+ {
+ "cell_type": "markdown",
+ "id": "b4fb1339",
+ "metadata": {},
+ "source": [
+ "#### Initialize results matrix\n",
+ "\n",
+ "For each value of $\\rho$ as a column, the results matrix will have the $\\beta$ estimate for each replication in the row. The matrix is thus of dimension **reps** times number of $\\rho$ values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "609e63c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "best = np.zeros((reps,len(rhovals)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "568db8d2",
+ "metadata": {},
+ "source": [
+ "#### Simulation loop"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "917baa9b",
+ "metadata": {},
+ "source": [
+ "The first step is to initialize the random seed for reproducibility. The function `spreg.make_error` is used to generate a standard normal random error vector. Then, together with the computed $X\\beta$, the error term is used to generate $y$ by means of `spreg.dgp_lag`. \n",
+ "\n",
+ "The $\\hat{\\beta}$ coefficient is extracted from the regression object as the second element in the `betas` attribute of the regression object obtained from `spreg.OLS`, and then entered in the results matrix. Running 1000 replications may take a minute or so, depending on hardware."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "970eb532",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "rng=np.random.default_rng(seed=rndseed)\n",
+ "for r in range(len(rhovals)):\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n)\n",
+ " y = dgp_lag(u,xb1,w,rhovals[r])\n",
+ " reg = OLS(y,x1,nonspat_diag=False)\n",
+ " best[i,r] = reg.betas[1]\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80a03c0c",
+ "metadata": {},
+ "source": [
+ "#### Results data frame"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a1b328a",
+ "metadata": {},
+ "source": [
+ "By means of Python list comprehension, a list of meaningful column headers is created that is related to the spatial parameter values.\n",
+ "\n",
+ "The result array is then converted to a *pandas* Data Frame using this list as the column names."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cfaaaf52",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rr = [\"rho\"+str(r) for r in rhovals]\n",
+ "results = pd.DataFrame(best,columns=rr)\n",
+ "results.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a984c06e",
+ "metadata": {},
+ "source": [
+ "#### Descriptive statistics\n",
+ "\n",
+ "The default descriptive statistics are obtained by means of the `describe` method of the *pandas* data frame. Of most interest are the **mean** and the standard deviation (**std**). Any difference between the mean and the true value of $\\beta$ indicates *bias*, whereas changes in the standard deviation suggest a change in precision with values of $\\rho$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9dc786a8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27895f57",
+ "metadata": {},
+ "source": [
+ "The descriptive statistics for this example illustrate how the estimate for $\\beta$ becomes more and more biased with increasing values of $\\rho$, with a mean of 1.002 under the null to a mean of 1.389 for $\\rho = 0.9$. In addition, the standard deviation of the estimate increases as well, from a value of 0.0198 under the null to 0.0409 for $\\rho = 0.9$, more than double (which also means that the variance is four times as large)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46564172",
+ "metadata": {},
+ "source": [
+ "#### Graphing the results"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6374d21",
+ "metadata": {},
+ "source": [
+ "Using *seaborn*, it is very straightforward to plot the distribution of a column in a data frame by means of the `sns.displot` command, specifying `kind=\"kde\"`. The other arguments are the name of the data frame (here, **results**) and the x-axis, for example **rho0.5**. The command `plt.show()` is included to show the plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f8df6de8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.displot(results, x=\"rho0.5\", kind=\"kde\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ddc26ff8",
+ "metadata": {},
+ "source": [
+ "The plot illustrates how the mean is no longer centered on the value of 1.000, but instead on 1.06."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "536eaec2",
+ "metadata": {},
+ "source": [
+ "Plotting the distribution of the estimates for all the spatial parameters on a single plot is a little trickier, and requires the use of the *pandas* `melt` functionality. As it stands, the data frame **results** is in so-called wide format, with a different column for each value of $\\rho$. *seaborn* likes this to be in so-called long format, or *tidy* (in R terminology), where all the $\\beta$ estimates form one long column with an additional variable that gives the value for $\\rho$. In a sense, each individual estimation result becomes a separate observation, with a value for $\\rho$ and a value for the $\\beta$ estimate.\n",
+ "\n",
+ "To accomplish this, two arguments need to be set, one for the new column that will contain the values of $\\rho$, `var_name`, the other for the regression coefficient that matches the replication-rho combination, named `value_name` in the *pandas* terminology. In the example, `var_name` becomes **\"rho\"**, since each of the columns contains the prefix **rho**, and `value_name` become **\"b\"**. The result is a long data frame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98e4caed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reslong = results.melt(var_name='rho',value_name='b')\n",
+ "reslong"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "039dbdac",
+ "metadata": {},
+ "source": [
+ "At this point, the `sns.displot` command can be applied to the new data frame with `x=\"b\"` and differentiated by the value of $\\rho$ by specifying the `hue=\"rho\"`. As before, `kind=\"kde\"` for a kernel density curve. Other parameters can be set as well, but that's beyond the current scope."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf600550",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.displot(reslong,x=\"b\",hue=\"rho\",kind=\"kde\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "51bb8bfb",
+ "metadata": {},
+ "source": [
+ "The plots clearly illustrate both the increasing bias as well as the larger variance with growing values of the spatial autoregressive parameter."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6596947d",
+ "metadata": {},
+ "source": [
+ "## Spatial Error Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "390441fc",
+ "metadata": {},
+ "source": [
+ "The same exercise is now repeated for a spatial error specification, using `spreg.dgp_sperror`. The error model is:\n",
+ "\n",
+ "$y = X\\beta + u$, with $u = \\lambda Wu + e$,\n",
+ "\n",
+ "where $\\lambda$ is the spatial autoregressive coefficient for the error vector.\n",
+ "\n",
+ "The values for $\\lambda$ are specified in **lamvals** at the top of the notebook. \n",
+ "\n",
+ "The default for a spatial error vector is an autoregressive process, with `model='sar'` as the option in `spreg.dgp_sperror` (since it is the default, it doesn't have to be specified, but it is included here for clarity).\n",
+ "\n",
+ "Note how the parameter vectors have been changed to **lamvals** and the column names in the data frame have the **lam** predicate. In all other respects, the logic is the same as for the previous experiment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "81a27ac8",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "best = np.zeros((reps,len(lamvals)))\n",
+ "rng=np.random.default_rng(seed=rndseed)\n",
+ "for r in range(len(lamvals)):\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n)\n",
+ " y = dgp_sperror(u,xb1,w,lamvals[r],model=\"sar\")\n",
+ " reg = OLS(y,x1,nonspat_diag=False)\n",
+ " best[i,r] = reg.betas[1]\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "rr = [\"lam\"+str(r) for r in lamvals]\n",
+ "results = pd.DataFrame(best,columns=rr)\n",
+ "results.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5357938",
+ "metadata": {},
+ "source": [
+ "In contrast to the spatial lag model, the estimates for $\\beta$ remain unbiased, with the mean centered on the correct value of 1.0. However, as the spatial parameter increases, the standard error goes from 0.0198 under the null to 0.0409 for $\\lambda = 0.9$, four times as much.\n",
+ "\n",
+ "A graph can be created for all values of $\\lambda$ in the same way as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d8f95820",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reslong = results.melt(var_name='lam',value_name='b')\n",
+ "sns.displot(reslong,x=\"b\",hue=\"lam\",kind=\"kde\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2f425024",
+ "metadata": {},
+ "source": [
+ "The pattern is confirmed by the graphs, with a lower curve corresponding to a larger variance. Note how for $\\lambda = 0.2$, the effect is negligible, with even a slightly smaller standard error. The impact becomes really pronounced for larger values of $\\lambda$."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41b77f6a",
+ "metadata": {},
+ "source": [
+ "### Moving Average Errors"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29078f0a",
+ "metadata": {},
+ "source": [
+ "For comparison, the simulations are also run for a spatial moving average error process. In this model, the error vector is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "u = \\lambda We + e,\n",
+ "\\end{equation*}\n",
+ "with $e$ as a standard normal error vector.\n",
+ "\n",
+ "The commands to carry out the simulation are all the same, except that in `spreg.dgp_sperror`, the `model` argument should be set to `\"ma\"`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f425e2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "best = np.zeros((reps,len(lamvals)))\n",
+ "rng=np.random.default_rng(seed=rndseed)\n",
+ "for r in range(len(lamvals)):\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n)\n",
+ " y = dgp_sperror(u,xb1,w,lamvals[r],model=\"ma\")\n",
+ " reg = OLS(y,x1,nonspat_diag=False)\n",
+ " best[i,r] = reg.betas[1]\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "rr = [\"lam\"+str(r) for r in lamvals]\n",
+ "results = pd.DataFrame(best,columns=rr)\n",
+ "results.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "83958659",
+ "metadata": {},
+ "source": [
+ "Here, the effect on the variance is much less pronounced, going from 0.020 under the null to 0.022 with $\\lambda = 0.9$. The same can be observed in the frequency graphs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "67c29fe0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reslong = results.melt(var_name='lam',value_name='b')\n",
+ "sns.displot(reslong,x=\"b\",hue=\"lam\",kind=\"kde\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e020c365",
+ "metadata": {},
+ "source": [
+ "## SLX"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b7ac083e",
+ "metadata": {},
+ "source": [
+ "The same approach is followed to assess the effect of ignoring a spatially lagged explanatory variable (SLX) on the regression slope coefficient. Formally, the SLX specification is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "y = X \\beta + WX \\gamma + u,\n",
+ "\\end{equation*}\n",
+ "\n",
+ "where $WX$ does not contain a constant term (the spatial lag of a constant term is the same constant, which would create perfect multicollinearity) and $\\gamma$ is a vector of parameters.\n",
+ "\n",
+ "The DGP for the SLX model is obtained from `spreg.dgp_slx`. Both **xb1** and **wxg1** need to be passed as arguments. The latter is the product $WX \\gamma$, which must be computed for each different value of $\\gamma$.\n",
+ "\n",
+ "To investigate a range of values for $\\gamma$, some slight adjustments to the code are needed. In the simulation loop, **wxg1** must be updated for each new value of $\\gamma$. Otherwise, the logic remains the same as before."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5441e84",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "best = np.zeros((reps,len(gamvals)))\n",
+ "rng=np.random.default_rng(seed=rndseed)\n",
+ "for r in range(len(gamvals)):\n",
+ "    wxg1 = make_wxg(wx1,gamvals[r])\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n)\n",
+ " y = dgp_slx(u,xb1,wxg1)\n",
+ " reg = OLS(y,x1,nonspat_diag=False)\n",
+ " best[i,r] = reg.betas[1]\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "rr = [\"gam\"+str(r) for r in gamvals]\n",
+ "results = pd.DataFrame(best,columns=rr)\n",
+ "results.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "03987a3d",
+ "metadata": {},
+ "source": [
+ "This result illustrates the classic *omitted variable bias* effect, since the ignored $WX$ is nothing but an omitted regressor. The effect is a slight bias in the $\\beta$ estimate, which becomes larger with larger $\\gamma$. The effect on the variance is minimal."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a9a7cb61",
+ "metadata": {},
+ "source": [
+ "This behavior is nicely illustrated by the frequency plots, which show a gradual shift away from the value of 1.0, but in contrast to what happens for the spatial lag model, all the curves have basically the same shape, indicating a minimal effect on the variance."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed484563",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reslong = results.melt(var_name='gam',value_name='b')\n",
+ "sns.displot(reslong,x=\"b\",hue=\"gam\",kind=\"kde\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a14f9b6",
+ "metadata": {},
+ "source": [
+ "## Spatial Durbin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33f966f7",
+ "metadata": {},
+ "source": [
+ "The final model illustrated here is a Spatial Durbin model, which includes both a spatially lagged dependent variable, $Wy$, and spatially lagged explanatory variables (SLX), $WX$:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "y = \\rho Wy + X\\beta + WX \\gamma + u.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "The dependent variable is generated by means of the reduced form, as in the spatial lag model, but now including the SLX term as part of the regression:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "y = (I - \\rho W)^{-1} (X\\beta + WX\\gamma) + (I - \\rho W)^{-1} u.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "\n",
+ "The effect of a misspecified spatial Durbin model is illustrated for a range of spatial autoregressive coefficients. For the sake of simplicity, $\\gamma$ is kept fixed, but of course, there could be a double loop over values of both $\\rho$ and $\\gamma$. The relevant function is `spreg.dgp_spdurbin`. It takes as arguments an error term, **xb**, **wxg**, the spatial weights **w**, and the spatial parameter $\\rho$ ($\\gamma$ is used in the calculation of **wxg**, outside of the loop)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f4a38b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t0 = time.time()\n",
+ "best = np.zeros((reps,len(rhovals)))\n",
+ "rng=np.random.default_rng(seed=rndseed)\n",
+ "wxg1 = make_wxg(wx1,gamma)\n",
+ "for r in range(len(rhovals)):\n",
+ " for i in range(reps):\n",
+ " u = make_error(rng,n)\n",
+ " y = dgp_spdurbin(u,xb1,wxg1,w,rhovals[r])\n",
+ " reg = OLS(y,x1,nonspat_diag=False)\n",
+ " best[i,r] = reg.betas[1]\n",
+ "t1 = time.time()\n",
+ "print(\"time in minutes: \",(t1-t0)/60.0)\n",
+ "rr = [\"rho\"+str(r) for r in rhovals]\n",
+ "results = pd.DataFrame(best,columns=rr)\n",
+ "results.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "306c0dd1",
+ "metadata": {},
+ "source": [
+ "The effect on the bias of the regression coefficient is similar to, but greater than, that for the simple spatial lag model, while the standard error changes in much the same way, essentially doubling over the range of $\\rho$.\n",
+ "\n",
+ "This is nicely illustrated by the series of frequency curves, with the right-most curve centered on 1.6."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1ff89fbf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reslong = results.melt(var_name='rho',value_name='b')\n",
+ "sns.displot(reslong,x=\"b\",hue=\"rho\",kind=\"kde\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "495f233d",
+ "metadata": {},
+ "source": [
+ "## Other Options"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7801863f",
+ "metadata": {},
+ "source": [
+ "The *dgp* module contains several other options, such as:\n",
+ "\n",
+ "- SLX error model with SAR and MA errors: `spreg.dgp_slxerror`\n",
+ "\n",
+ "- SAR-Error model: `spreg.dgp_lagerr`\n",
+ "\n",
+ "- SARMA model: `spreg.dgp_lagerr` with `model=\"ma\"`\n",
+ "\n",
+ "- GNS SAR model: `spreg.dgp_gns`\n",
+ "\n",
+ "- GNS MA model: `spreg.dgp_gns` with `model=\"ma\"`\n",
+ "\n",
+ "In addition, the random errors can be generated with different variances and, besides the default normal distribution, from a `laplace`, `cauchy` or `lognormal` distribution by means of the `method` argument of `make_error`.\n",
+ "\n",
+ "Finally, the X matrix can be generated from a uniform distribution (the default), but also as independent normal vectors (`method = 'normal'`) or as bivariate correlated vectors (`method = 'bivnormal'`). While the latter is only implemented for two explanatory variables, the number of regressors for the other options is not constrained. A brief illustration of these options is sketched below."
+ ]
+ },
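+ {
+ "cell_type": "markdown",
+ "id": "3c9a7f10",
+ "metadata": {},
+ "source": [
+ "The sketch below is purely illustrative and not part of the simulation experiments. It only assumes the `make_error` and `make_x` functions from `spreg` together with a fresh numpy random number generator; the specific parameter values are arbitrary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3c9a7f11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustration only: errors and X from non-default distributions\n",
+ "import numpy as np\n",
+ "from spreg import make_error, make_x\n",
+ "rng_demo = np.random.default_rng(12345)\n",
+ "# Laplace errors with variance 2 (thicker tails than the normal)\n",
+ "u_lap = make_error(rng_demo,5,varu=2.0,method='laplace')\n",
+ "print(u_lap)\n",
+ "# two correlated normal regressors (bivariate normal, correlation 0.5)\n",
+ "x_biv = make_x(rng_demo,5,mu=[0,1],varu=[1,4],cor=0.5,method='bivnormal')\n",
+ "print(x_biv)"
+ ]
+ },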
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "Assess the properties of OLS for different forms of spatial misspecifications. For example, you could examine the effect of negative spatial coefficients, or of multiple spatially lagged regressors, as well as the misspecification caused by the models that were not included in the examples, such as SAR-Error and SLX-Error. You could also experiment with different sample sizes and different spatial weights, especially weights with different connectedness characteristics.\n",
+ "\n",
+ "For now, the treatment of estimation has been limited to classic OLS. Before venturing into the estimation of the spatial models, it is important to have a good sense of the implications of ignoring spatial effects and to what extent they matter.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/8_spatial_multipliers.ipynb.txt b/_sources/notebooks/8_spatial_multipliers.ipynb.txt
new file mode 100644
index 00000000..d7c7d513
--- /dev/null
+++ b/_sources/notebooks/8_spatial_multipliers.ipynb.txt
@@ -0,0 +1,833 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Spatial Multipliers\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/17/2024)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preliminaries"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook takes a closer look at the spatial pattern induced by spatial lag terms in a regression specification. The usual notion of *average* impacts is disassembled into its individual components, with a particular focus on its spatial distribution. Three broad types of models are considered: a linear SLX model, a model with a spatially lagged dependent variable (Wy), and nonlinear SLX models."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Prerequisites"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook includes material that relies more on familiarity with Python than the previous notebooks, specifically the manipulation of pandas data frames and geopandas geo data frames, as well as the interpretation of the helper functions. In addition, spatial weights functionality and choropleth mapping are used extensively. Both were covered in previous notebooks. Two specialized routines are included from `spreg`, i.e., `i_multipliers` (to compute the individual multipliers) and `make_wnslx` (to create the lag operator in nonlinear SLX models). It is also assumed that the sample data set **Police** is installed (if not, execute `libpysal.examples.load_example(\"Police\")` first, with the module `libpysal.examples` imported)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Modules Needed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The usual imports include numpy, pandas, geopandas and matplotlib (for mapping). More specialized imports consist of `examples.get_path` and `weights` from `libpysal`, as well as `i_multipliers` (from `spreg.sputils`) and `make_wnslx` (from `spreg.utils`). To allow the complete set of observations (n=82) to be listed in the notebook, the `pd.options.display.max_rows` should be set to a value larger than 82. Also, as usual, numpy `set_printoptions` should be set to `legacy=\"1.25\"`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings \n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os \n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from libpysal.examples import get_path\n",
+ "import libpysal.weights as weights\n",
+ "from spreg.sputils import i_multipliers\n",
+ "from spreg.utils import make_wnslx\n",
+ "np.set_printoptions(legacy=\"1.25\")\n",
+ "pd.options.display.max_rows = 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Functionality Used"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- from numpy: \n",
+ " - array\n",
+ "\n",
+ "- from pandas/geopandas:\n",
+ " - idxmax, idxmin\n",
+ " - iloc\n",
+ " - plot\n",
+ " - concat\n",
+ " - read_file\n",
+ " - centroid\n",
+ " - describe\n",
+ " \n",
+ "- from libpysal.examples:\n",
+ " - get_path\n",
+ "\n",
+ "- from libpysal.weights:\n",
+ " - Queen.from_dataframe\n",
+ " - neighbors\n",
+ " - cardinalities\n",
+ " - KNN.from_dataframe\n",
+ " - Kernel.from_dataframe\n",
+ "\n",
+ "- from spreg:\n",
+ " - sputils.i_multipliers\n",
+ " - utils.make_wnslx"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Helper Functions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since many operations will be repeated almost verbatim for the different types of weights and multipliers, two helper functions are used. One, `multmap`, simplifies the mapping, with (very) limited customization. If additional customization is desired, the function itself must be adjusted. The function is essentially a wrapper around the commands to make a quantile choropleth map for a geo data frame.\n",
+ "\n",
+ "The second function, `nbreffect`, finds the minimum and maximum locations for a given type of multiplier effect and then calls `multmap` to create the associated map."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def multmap(dfm,column='EonNbrs',model='slx'):\n",
+ " \"\"\"\n",
+ " Creates a quintile map (k=5) for the multipliers computed by means\n",
+ " of i_multiplier\n",
+ "\n",
+ " Arguments\n",
+ " ---------\n",
+ " dfm : merged geo data frame with original data and spatial multiplier\n",
+ " data frame\n",
+ " column : type of multiplier to be mapped, default is EonNbrs\n",
+ " model : spatial model, default is slx (i.e., only weights matrix)\n",
+ "\n",
+ " Returns\n",
+ " -------\n",
+ " Draws map\n",
+ "\n",
+ " \"\"\"\n",
+ " ax = dfm.plot(\n",
+ " column = column,\n",
+ " scheme = 'Quantiles',\n",
+ " k = 5,\n",
+ " cmap = 'YlOrRd',\n",
+ " edgecolor = \"Black\",\n",
+ " linewidth = 0.2,\n",
+ " figsize = (6,6),\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": column}\n",
+ " )\n",
+ " newtitle = \"Spatial Multipliers for \" + model\n",
+ " ax.set_title(newtitle)\n",
+ " ax.set_axis_off()\n",
+ " plt.show()\n",
+ " return\n",
+ "\n",
+ "\n",
+ "def nbreffect(dfs,dfm,oid,mult='EonNbrs',model='slx'):\n",
+ " \"\"\"\n",
+ " Computes the minimum and maximum value and location for a given type of multiplier\n",
+ " and calls multmap to create a quintile map\n",
+ "\n",
+ " Arguments\n",
+ " ---------\n",
+ " dfs : initial geo data frame\n",
+ " dfm : data frame with multipliers\n",
+ " oid : pandas series with ID variable\n",
+ "    mult : type of multiplier, default is EonNbrs\n",
+ " model : spatial model, default is slx\n",
+ "\n",
+ " \"\"\"\n",
+ " if mult == 'Direct':\n",
+ " coli = 1\n",
+ " elif mult == 'EofNbrs':\n",
+ " coli = 2\n",
+ " elif mult == 'EonNbrs':\n",
+ " coli = 3\n",
+ " else:\n",
+ " raise Exception(\"Invalid column\")\n",
+ " imax = int(dfm[[mult]].idxmax())\n",
+ " print(f\"The maximum is {dfm.iloc[imax,coli]:0.3f} with id {int(oid.iloc[imax])}\")\n",
+ " imin = int(dfm[[mult]].idxmin())\n",
+ " print(f\"The minimum is {dfm.iloc[imin,coli]:0.3f} with id {int(oid.iloc[imin])}\")\n",
+ " dfmap = pd.concat([dfs,dfm],axis=1)\n",
+ " multmap(dfmap,column=mult,model=model)\n",
+ " return\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The different multipliers are illustrated with the **Police** sample data set. A geo data frame is created which is used in the remainder to construct the spatial weights. The only input is the shape file:\n",
+ "\n",
+ "- police.shp (shx,dbf): police expenditure data for 82 Mississippi counties\n",
+ "\n",
+ "The variable `FIPSNO` is taken as the id variable and turned into a pandas Series `fipsid`; the variable name itself is assigned to `idvar`. This makes it easy to use the same code for other data sets (replace infileshp, fipsid and idvar by the appropriate file name/variable name)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = \"police.shp\" # input shape file\n",
+ "inpath = get_path(infileshp)\n",
+ "dfs = gpd.read_file(inpath)\n",
+ "print(dfs.columns)\n",
+ "fipsid = dfs[[\"FIPSNO\"]]\n",
+ "idvar = \"FIPSNO\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Coordinates"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The construction of the sparse weights for the nonlinear SLX power and exponential functions requires a numpy array of X and Y coordinates as input. The **Police** data set does not include those variables explicitly, but they can be computed as the centroids of the polygons, using the `centroid` attribute of the geopandas data frame.\n",
+ "\n",
+ "However, the `make_wnslx` function needs the coordinates as a numpy array, whereas the result of the `centroid` attribute is a geopandas geoseries with `POINT` geometries. This is turned into a numpy array by extracting the x and y coordinates and passing these to `np.array`. As it turns out, the result must be transposed, since as is, it becomes a 2 by 82 matrix and not the desired 82 by 2 matrix.\n",
+ "\n",
+ "Note that the coordinates are decimal degrees (longitude-latitude), which will require the argument `distance_metric = \"Arc\"` in the `make_wnslx` function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gcent = dfs.centroid\n",
+ "print(type(gcent))\n",
+ "print(gcent.head())\n",
+ "coords = np.array((gcent.x,gcent.y)).T\n",
+ "print(coords.shape)\n",
+ "coords[0:5,:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Spatial Weights Characteristics"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The simplest calculations are for the linear SLX model, where the spatial spillovers are limited to the neighbors as specified in the spatial weights matrix. With row-standardization, the effect **of** the neighbors always sums to one, so by construction it is the same (=1) for all observations. Similarly, because the spatial weights have zero on the diagonal, there is no direct effect other than that already captured by the coefficients of the X-variables. However, because the row-standardization introduces an asymmetry in the weights (but not in the contiguity structure), the effect **on** the neighbors of a change in X in a given location is **not constant** across all observations. This effect is the sum of the column elements associated with each observation. Whereas the mean of these sums is the same as the mean of the row sums and equals one, there remains considerable variation in the contribution of a change in a variable at a given location to its neighbors. This is masked by using average effects.\n",
+ "\n",
+ "The following three examples show the spatial distribution of the individual multipliers for queen contiguity, k-nearest neighbors and kernel weights. Queen contiguity is intrinsically symmetric, but becomes asymmetric after row-standardization. K-nearest neighbor weights tend not to be symmetric to begin with (k-nearest neighbor is not a symmetric relationship). Kernel weights that are based on a fixed distance band are symmetric. Since they are not row-standardized, the effect **of** the neighbors and the effect **on** the neighbors are the same. However, when kernel weights are based on an adaptive bandwidth (such as k-nearest neighbors), they will be intrinsically asymmetric and the row and column elements for an observation will differ. In all cases, the average masks sometimes substantial spatial variation among observations. A small sketch with a toy weights matrix follows to illustrate the contrast between row and column sums."
+ ]
+ },
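+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The small sketch below makes this concrete with a hypothetical 4 by 4 binary contiguity matrix (unrelated to the Police data): after row-standardization, every row sum equals one, but the column sums, which correspond to the effect on the neighbors, vary across observations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# toy example: row sums versus column sums of a row-standardized weights matrix\n",
+ "import numpy as np\n",
+ "wb = np.array([[0, 1, 1, 1],\n",
+ "               [1, 0, 1, 0],\n",
+ "               [1, 1, 0, 0],\n",
+ "               [1, 0, 0, 0]], dtype=float)\n",
+ "wr = wb / wb.sum(axis=1, keepdims=True)   # row-standardize\n",
+ "print(\"row sums    :\", wr.sum(axis=1))    # all equal to one\n",
+ "print(\"column sums :\", wr.sum(axis=0))    # vary across units"
+ ]
+ },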
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Queen Contiguity Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As covered in a previous notebook, queen contiguity weights are constructed from the geo data frame by means of `weights.Queen.from_dataframe`, here using `ids=idvar` for the id-variable (which is `\"FIPSNO\"` in this notebook). The weights are row-standardized and then the individual multipliers are calculated using `i_multipliers` with `model=\"slx\"` and `id=fipsid` (the pandas Series with FIPSNO created above). There is no need to specify a spatial coefficient since the default is `coef=0.0`. Some descriptive statistics are provided by `describe()`.\n",
+ "\n",
+ "The first column gives the ID-variable that was used and is not very meaningful. The second column describes the **Direct** effects, which are zero by construction. The last two columns pertain to the **effect of neighbors** (row sum, **EofNbrs**) and the **effect on neighbors** (column sum, **EonNbrs**)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wq = weights.Queen.from_dataframe(dfs,ids=idvar)\n",
+ "wq.transform = 'r'\n",
+ "queen = i_multipliers(wq,model='slx',id=fipsid)\n",
+ "queen.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As expected, the **mean** of both effect of neighbors and effect on neighbors is the same and equals one. However, there is quite a bit of variation in the effect **on** neighbors, with a range from 0.560 to 1.667 and a standard deviation of 0.25. The full range of variability can be seen by printing out the complete data frame (to see all the rows, the pandas `options.display.max_rows` must be set to more than 82)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(queen)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The position and ID of the largest and smallest effects on neighbors can be found by means of `idxmax` and `idxmin` applied to the `EonNbrs` data series. This yields the county with FIPS 28131 (Stone) for the maximum and the county with FIPS 28045 (Hancock) for the minimum. The spatial variation (and location of minimum and maximum) is directly related to the graph-theoretic structure of the spatial weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imax = int(queen[['EonNbrs']].idxmax())\n",
+ "print(f'The maximum is {queen.iloc[imax,3]:0.3f} with id {int(fipsid.iloc[imax])}')\n",
+ "imin = int(queen[['EonNbrs']].idxmin())\n",
+ "print(f'The minimum is {queen.iloc[imin,3]:0.3f} with id {int(fipsid.iloc[imin])}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To get a better insight into the effect of the asymmetry introduced by the row-standardization, the following lines of code extract the list of neighbors for county 28131, then check their respective cardinalities from `wq.cardinalities` and compute the cumulative sum of the corresponding weights (the inverse of the cardinalities). Because many of the neighbors themselves have (much) fewer than seven neighbors, their contribution to the cumulative sum is more than 1/7, yielding an overall multiplier effect that can become larger than one. The opposite case is where the neighbors have more neighbors themselves, yielding spatial weights of less than 1/7, which contribute to a multiplier effect that can be smaller than one.\n",
+ "\n",
+ "As an exercise, repeat this analysis for the county with the minimum multiplier of 0.560 by entering its fips code in `wq.neighbors` (county 28045 - this county only has three neighbors)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nbrs = wq.neighbors[28131]\n",
+ "print(\"neighbor list \",nbrs)\n",
+ "cards = wq.cardinalities\n",
+ "j = 0\n",
+ "for i in nbrs:\n",
+ " u = cards[i]\n",
+ " print(f\"for {i}, the number of neighbors is {u}, with weight {1.0/u:0.3f}\")\n",
+ " j += 1.0/u\n",
+ "print(f\"total effect on neighbors: {j:0.3f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The spatial distribution of the multiplier effects can be mapped. This functionality is wrapped in the functions listed at the top of the notebook, but first each step is spelled out in detail here. To begin, `pd.concat` is used to add the new data frame to the existing spatial data frame. Because this operation will be repeated for each type of multiplier, the merged data frame is given a new name (otherwise, `dfs` would be accumulating all the multiplier effects that have the same name). The multipliers are visualized in a quintile map (k=5) using a common set of customizations as covered in the mapping notebook.\n",
+ "\n",
+ "The resulting map reveals considerable variation in the spatial pattern. From a policy perspective, the darkest counties are the ones with the most impact on their neighbors and thus they could be targeted for maximum effect place-based policies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfq = pd.concat([dfs,queen],axis=1)\n",
+ "ax = dfq.plot(\n",
+ " column = 'EonNbrs',\n",
+ " scheme = 'Quantiles',\n",
+ " k = 5,\n",
+ " cmap = 'YlOrRd',\n",
+ " edgecolor = \"Black\",\n",
+ " linewidth = 0.2,\n",
+ " figsize = (6,6),\n",
+ " legend = True,\n",
+ " legend_kwds={\"loc\":\"center left\",\"bbox_to_anchor\":(1,0.5), \"title\": 'EonNbrs'}\n",
+ ")\n",
+ "ax.set_title(\"Spatial Multipliers\")\n",
+ "ax.set_axis_off()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since the mapping will be repeated for each multiplier, the various settings are encapsulated in the `multmap` helper function listed at the top of the notebook. As a result, the map can now be obtained with a single line command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multmap(dfq)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### K-Nearest Neighbor Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "K-nearest neighbor weights are constructed from the geo data frame by means of `weights.KNN.from_dataframe`. Since the median number of neighbors for queen contiguity was 5, k is set to 5 as well, to keep some degree of comparability. The multipliers are again obtained with `i_multipliers` passing the row-standardized weights, the model (`slx`) and the identifiers (`idvar`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wk = weights.KNN.from_dataframe(dfs,k=5,ids=idvar)\n",
+ "wk.transform = 'r'\n",
+ "knn = i_multipliers(wk,model='slx',id=fipsid)\n",
+ "knn.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Again, by construction, the direct effects are zero and the effect of the neighbors is 1, due to row-standardization. However, there is considerable variation in the effect **on** neighbors, again due to the asymmetry of the weights. The multipliers range from 0.4 to 1.8, a wider range than for queen contiguity.\n",
+ "\n",
+ "Further insight into the spatial pattern of the multipliers is obtained by means of the helper function `nbreffect`, which combines identifying the maximum and minimum with the mapping of `multmap`. The arguments are the original data frame, `dfs`, the multiplier data frame for k-nearest neighbor weights, `knn`, the id variable, `fipsid`, the type of multiplier, `EonNbrs`, and the model, `slx`. This gives all the desired results with a one-line command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nbreffect(dfs,knn,fipsid,mult='EonNbrs',model='slx')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The spatial pattern shows quite a few differences with that for the queen weights, highlighting the importance of the structure of the spatial weights. The maximum location is now county 28081 (Lee) and the minimum is county 28157 (Wilkinson). More specialized map comparison techniques (such as a co-location map) can be used to compare the two patterns, but this is not further pursued here.\n",
+ "\n",
+ "Note how the quantile map for knn only shows four categories, even though k was set to 5. This is due to the large number of ties, which results in the two bottom categories being collapsed. This is a common problem for quantile maps when there is insufficient variation in the variable considered (here, all multiples of 0.2)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Kernel Weights"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Kernel weights are illustrated for a `triangular` function (the default) computed for a variable bandwidth (`fixed=False`) determined by the 10 nearest neighbors. With `diagonal=True`, the diagonal elements all equal one. The function `i_multipliers` is called with the kernel weights (`wkern`), `model=\"kernel\"` and the same id (`fipsid`) as before.\n",
+ "\n",
+ "The result data frame is `kern`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wkern = weights.Kernel.from_dataframe(dfs,k=10,ids=idvar,fixed=False,diagonal=True)\n",
+ "kern = i_multipliers(wkern,model='kernel',id=fipsid)\n",
+ "kern.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The direct effects are all equal to one by construction. Both the effect of the neighbors (row sum - without the diagonal) and effect on the neighbors (column sum - without the diagonal) show substantial variation, even though their means are the same. The effect of the neighbors ranges from 1.789 to 3.833, whereas the effect on the neighbors goes from 1.562 to 3.890, a slightly larger range.\n",
+ "\n",
+ "The location of the extrema and the map for the individual multipliers can again be obtained by means of `nbreffect`. Since this now has to be done for `EofNbrs` as well as `EonNbrs`, both are put in a list and the results are produced by a simple loop, minimizing code repetition."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,kern,fipsid,mult=eff,model='kernel')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The resulting respective maximum and minimum locations are quite distinct, as are the associated spatial patterns. A closer examination of the spatial (mis)match is left as an exercise."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Spatial Lag"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The spatial spillovers in a model that includes a spatially lagged dependent variable Wy are determined by the structure of the inverse matrix $(I - \\rho W)^{-1}$. This is illustrated here for the queen contiguity weights. Similar results can be obtained for the k-nearest neighbor weights.\n",
+ "\n",
+ "The `i_multipliers` function needs as arguments the spatial weights (`wq`), a value for the spatial autoregressive coefficient (`coef=0.5`), the model type (`lag`) and the `id`. Descriptive statistics are provided by `describe`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag5 = i_multipliers(wq,coef=0.5,model='lag',id=fipsid)\n",
+ "lag5.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The mean of **Direct**, 1.066, is what is typically referred to as **ADI** or average direct impact, whereas the mean of either **EofNbrs** or **EonNbrs**, 0.934, is the **AII** or average indirect impact. Since $\\rho = 0.5$, the average total impact (**ATI**) is $1.0 / (1.0 - \\rho)$, or 2.0. Clearly this equals the sum of 1.066 and 0.934.\n",
+ "\n",
+ "The averages mask considerable individual variation. The range of variation is fairly small for the direct effects - from 1.046 to 1.096 - and the effect of neighbors - from 0.904 to 0.954 -, but is quite substantial for the effect on neighbors - from 0.532 to 1.482.\n",
+ "\n",
+ "A short numerical check of this decomposition is sketched below. Details on the individual patterns are again provided by means of the `nbreffect` function, now with `model = \"lag\"`."
+ ]
+ },
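+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick numerical check of this decomposition (illustration only, using a small hypothetical row-standardized weights matrix rather than the queen weights), the sketch below computes ADI, AII and ATI directly from the inverse matrix $(I - \\rho W)^{-1}$ and verifies that the total equals $1 / (1 - \\rho)$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# numerical check: ADI + AII = ATI = 1 / (1 - rho) for a row-standardized W\n",
+ "import numpy as np\n",
+ "rho = 0.5\n",
+ "wb = np.array([[0, 1, 1, 0],\n",
+ "               [1, 0, 1, 0],\n",
+ "               [1, 1, 0, 1],\n",
+ "               [0, 0, 1, 0]], dtype=float)\n",
+ "ws = wb / wb.sum(axis=1, keepdims=True)       # row-standardized weights\n",
+ "nn = ws.shape[0]\n",
+ "minv = np.linalg.inv(np.eye(nn) - rho * ws)   # (I - rho W)^{-1}\n",
+ "adi = np.trace(minv) / nn                     # average direct impact\n",
+ "ati = minv.sum() / nn                         # average total impact\n",
+ "aii = ati - adi                               # average indirect impact\n",
+ "print(f'ADI = {adi:.3f}, AII = {aii:.3f}, ATI = {ati:.3f}, 1/(1-rho) = {1.0/(1.0-rho):.3f}')"
+ ]
+ },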
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['Direct','EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,lag5,fipsid,mult=eff,model='lag')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The spatial patterns of the direct effects and the effects of neighbors are exact opposites. This reflects how the total effect is split between these two parts: larger direct effects imply smaller indirect effects and vice versa. On the other hand, the spatial pattern for the effect on neighbors is quite distinct. It is very similar (but not identical) to the spatial pattern for the queen weights as such (SLX)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Interestingly, the magnitude of the spatial autoregressive coefficient, while it affects the mean impact measures, does not affect the spatial pattern of the multipliers, which remains exactly the same. This is because the spatial pattern is determined by the network structure in the weights and the coefficient is just a scaling factor.\n",
+ "\n",
+ "This is illustrated by using 0.3 as the spatial autoregressive coefficient."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lag3 = i_multipliers(wq,coef=0.3,model='lag',id=fipsid)\n",
+ "lag3.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['Direct','EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,lag3,fipsid,mult=eff,model='lag')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Negative Exponential Distance Function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The multiplier effects for a nonlinear SLX model can be computed in the same way as before, from the diagonals, row sums and column sums of the parameterized weights matrix $W(\\alpha)$. Two cases are considered: a negative exponential distance transformation and an inverse power distance transformation. The input for the estimation of this model, in `spreg.NSLX`, is a sparse CSR matrix of distance measures that are standardized with respect to the bandwidth for each observation (fixed or variable). This ensures a well-behaved distance metric and a distance decay process.\n",
+ "\n",
+ "For the negative exponential transformation in the nonlinear SLX model, the transformation is $e^{-\\alpha z_{ij}}$, where $z_{ij} = d_{ij} / d_{bw}$ for $d_{ij} \\le d_{bw}$, with $d_{bw}$ as the bandwidth; for $d_{ij} > d_{bw}$, the weight is set to zero. \n",
+ "\n",
+ "The input is a sparse CSR matrix, constructed by means of `make_wnslx` with the parameter `exponential`. As input it takes a numpy array of coordinates (see above), a tuple of parameters setting the number of nearest neighbors (here 10), whether the bandwidth is adaptive (`np.inf`) or fixed (a value), and the type of transformation (`exponential`). Since the coordinates are lat-lon decimal degrees, `distance_metric = \"Arc\"` is used to compute great circle distances. The result is a sparse CSR matrix. Its contents can be shown by means of `toarray`, which turns it into a regular (full) numpy array. This creates a dense copy for display only and does not affect the sparse representation.\n",
+ "\n",
+ "Since the transformed distance input is the fraction of the bandwidth, it results in smaller values closer to the origin and a value of 1.0 for the farthest nearest neighbor. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wexp = make_wnslx(coords,params=(10,np.inf,\"exponential\"),leafsize=30,distance_metric='Arc')\n",
+ "print(type(wexp))\n",
+ "wexp.toarray()[0:1,:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Some descriptive statistics and the spatial pattern are obtained in the same way as before, using `i_multipliers` and `nbreffect`. The parameter is set to `coef = 2.0` and the model is `exponential`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exp2 = i_multipliers(wexp,coef=2.0,model='exponential',id=fipsid)\n",
+ "exp2.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The mean multiplier of 2.648 ranges from 2.117 to 3.315 for the effect of neighbors, and from 1.368 to 3.871 for the effect on neighbors. The spatial distribution of the multipliers is again distinct."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,exp2,fipsid,mult=eff,model='exponential')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Again, a different value of the coefficient affects the mean multiplier, but the spatial distribution remains the same. For example, with `coef = 1.5`, the mean multiplier becomes 3.636, reflecting the less steep distance decay that results from the smaller parameter.\n",
+ "\n",
+ "This is important for the interpretation of the model. Unlike the other spatial models, a larger value of the coefficient in the nonlinear SLX models results in a *smaller multiplier*, as the short numerical check below illustrates."
+ ]
+ },
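+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To see why a larger coefficient yields a smaller multiplier, note that the transformation $e^{-\\alpha z}$ decreases in $\\alpha$ for any standardized distance $z$ between 0 and 1. A short numerical check (illustration only, with $z = 0.5$):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# steeper distance decay for a larger alpha: exp(-alpha * z) at z = 0.5\n",
+ "import numpy as np\n",
+ "z = 0.5\n",
+ "for alpha in (1.5, 2.0):\n",
+ "    print(f'alpha = {alpha}: weight = {np.exp(-alpha * z):.3f}')"
+ ]
+ },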
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "exp15 = i_multipliers(wexp,coef=1.5,model='exponential',id=fipsid)\n",
+ "exp15.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,exp15,fipsid,mult=eff,model='exponential')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Inverse Distance Power Function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For the inverse distance power function, the transformation is $1.0 / d_{ij}^{\\alpha}$. This is implemented as $z_{ij}^{\\alpha}$, where $z_{ij}$ is guaranteed to be less than one, as $z_{ij} = 1 - d_{ij}/d_{bw}$ for $d_{ij} \\le d_{bw}$, and zero otherwise. Hence, the distance measure for this transformation corresponds to a triangular kernel for the given bandwidth, with the diagonals zeroed out. The transformed value $z_{ij}$ decreases as the distance increases, eventually reaching zero at the bandwidth (the k-th nearest neighbor). \n",
+ "\n",
+ "Again, the corresponding weights are computed by means of the `make_wnslx` function (from `spreg.utils`). As input it takes a numpy array of coordinates (see above), a tuple of parameters setting the number of nearest neighbors (here 10), whether the bandwidth is variable (`np.inf`) or fixed (a value), and the type of transformation (`power`). Since the coordinates are lat-lon decimal degrees, `distance_metric = \"Arc\"` is used to compute great circle distances. The result is a sparse CSR matrix. Its contents can be shown by means of `toarray`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "winv = make_wnslx(coords,params=(10,np.inf,\"power\"),distance_metric='Arc')\n",
+ "print(type(winv))\n",
+ "winv.toarray()[0:1,:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The sparse array `winv` is the spatial weights argument to the function `i_multipliers`. A coefficient of 2.0 is used as `coef` and the model is specified as `power`. The id is the same as before. Some summary characteristics are shown by means of `describe`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pow2 = i_multipliers(winv,coef=2.0,model='power',id=fipsid)\n",
+ "pow2.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The mean multiplier effect is 1.279, but it ranges from 0.628 to 2.124 for the effect of neighbors and from 0.709 to 1.917 for the effect on neighbors. Again, the mean masks considerable spatial variation. The direct effects are zero by construction.\n",
+ "\n",
+ "The spatial pattern of the individual multipliers can be investigated in the same way as for the other multipliers by means of the `nbreffect` function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,pow2,fipsid,mult=eff,model='power')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Again, the spatial pattern of the multipliers is not affected by the spatial parameter, but is solely determined by the graph structure implied by the spatial weights before applying the transformation. This is illustrated with `coef = 1.5`. The mean effect of 1.902 is larger than for `coef=2.0`. This is due to the slower distance decay with the smaller coefficient, which gives larger weight to neighbors farther away. The spatial pattern is unchanged."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pow15 = i_multipliers(winv,coef=1.5,model='power',id=fipsid)\n",
+ "pow15.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "effects = ['EofNbrs','EonNbrs']\n",
+ "for eff in effects:\n",
+ " nbreffect(dfs,pow15,fipsid,mult=eff,model='power')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "The similarities and differences between the different spatial layouts can be investigated more closely. In addition, the correlation between the multiplier vectors, respective locations of extrema and other characteristics can be examined. Of course, the same type of analysis can be investigated for other spatial weights and/or another data set."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "py312",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/_sources/notebooks/9_specification_tests.ipynb.txt b/_sources/notebooks/9_specification_tests.ipynb.txt
new file mode 100644
index 00000000..f1ea659b
--- /dev/null
+++ b/_sources/notebooks/9_specification_tests.ipynb.txt
@@ -0,0 +1,594 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b8975c4",
+ "metadata": {},
+ "source": [
+ "# Specification Tests\n",
+ "\n",
+ "### Luc Anselin\n",
+ "\n",
+ "### (revised 09/11/2024)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cfd0985",
+ "metadata": {},
+ "source": [
+ "## Preliminaries\n",
+ "\n",
+ "In this notebook, the basic regression diagnostics for spatial autocorrelation are introduced. These include the classic Moran's I test, as well as the Lagrange Multiplier/Rao Score tests for lag and error dependence developed during the 1980s and 1990s. In addition, the recent tests for the Spatial Durbin specification developed by Koley and Bera are covered. The tests are detailed in Anselin and Rey (2014), *Modern Spatial Econometrics in Practice*, and in Anselin, Serenini and Amaral (2024), *Spatial Econometric Model Specification Search: Another Look* (DOI: 10.13140/RG.2.2.10650.86721), as well as in the references therein.\n",
+ "\n",
+ "In addition to the classic case with OLS estimates, a test for spatial correlation is covered for models with endogenous variables, estimated by means of 2SLS.\n",
+ "\n",
+ "### Prerequisites\n",
+ "\n",
+ "Familiarity with OLS and 2SLS estimation in *spreg* is assumed, as covered in the respective notebooks, as well as basics of *numpy*, *pandas*, *geopandas*, and *libpysal*. In addition, it is assumed that the **chicagoSDOH** PySAL sample data set has been installed (for specific instructions, refer to the *sample data notebook*)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d8a50738",
+ "metadata": {},
+ "source": [
+ "### Modules Needed\n",
+ "\n",
+ "The main module for spatial regression in PySAL is *spreg*. In addition *libpysal* is needed for data import and spatial weights manipulation, and *geopandas* for data input from a shape file. This notebook is based on version 1.7 of *spreg*. \n",
+ "\n",
+ "As before, only the needed functions from *libpysal* are imported, i.e., `libpysal.io.open` as `open`, `libpysal.examples.get_path` as `get_path`, and `libpysal.weights` as `weights`. The `OLS` and `TSLS` estimation routines are imported from `spreg`.\n",
+ "\n",
+ "Some additional imports are included to avoid excessive warning messages. With later versions of PySAL, these may not be needed. As before, the `set_printoptions` is used for *numpy* 2.0 and later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e398e42f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")\n",
+ "import os\n",
+ "os.environ['USE_PYGEOS'] = '0'\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import geopandas as gpd\n",
+ "from libpysal.io import open\n",
+ "from libpysal.examples import get_path\n",
+ "import libpysal.weights as weights\n",
+ "from spreg import OLS, TSLS\n",
+ "np.set_printoptions(legacy=\"1.25\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ac85fb3",
+ "metadata": {},
+ "source": [
+ "### Functionality Used\n",
+ "\n",
+ "- from geopandas:\n",
+ " - read_file\n",
+ " \n",
+ "- from libpysal:\n",
+ " - examples.get_path\n",
+ " - io.open\n",
+ " - weights.transform\n",
+ " \n",
+ "- from spreg:\n",
+ " - OLS\n",
+ " - TSLS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b34442e2",
+ "metadata": {},
+ "source": [
+ "### Data, Weights and Variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "effd05a4",
+ "metadata": {},
+ "source": [
+ "As in the previous notebooks, all data sets, weights files and variables are specified at the top, so that they can be easily changed to other examples.\n",
+ "\n",
+ "Data sets and weights are from the **chicagoSDOH** sample data set:\n",
+ "\n",
+ "- **Chi-SDOH.shp,shx,dbf,prj**: socio-economic indicators of health for 2014 in 791 Chicago tracts\n",
+ "- **Chi-SDOH_q.gal**: queen contiguity spatial weights created with *GeoDa*\n",
+ "\n",
+ "The weights are used in row-standardized form.\n",
+ "\n",
+ "For the OLS case, the same model specification is used as before, with **YPLL_rate** (an index measuring premature mortality, i.e., higher values are worse health outcomes) as the dependent variable, and **HIS_ct** (economic hardship index), **Blk14P** (percent Black population), and **Hisp14P** \n",
+ "(percent Hispanic population) as the explanatory variables. These are specified in the **y_name** etc. variables, which are then used to create the corresponding *numpy* vectors and matrices for use as **y**, **x**, etc. in the regression specification.\n",
+ "\n",
+ "For the 2SLS case, the variable **HIS_ct** is considered to be endogenous, with **COORD_X** and **COORD_Y** (the tract centroids) as instruments, as in the 2SLS notebook.\n",
+ "\n",
+ "The various initializations are carried out in two steps:\n",
+ "\n",
+ "- first, all file names and variable names are defined\n",
+ "- second, the files are read and variable vectors/matrices constructed\n",
+ "\n",
+ "The first step allows for customization to other examples, the second step is agnostic to the actual files and variables that were specified. To keep the code simple, there are no error checks for missing files or mismatches in the variable names."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "829cca11",
+ "metadata": {},
+ "source": [
+ "#### Specify file and variable names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "fc2dba41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "infileshp = get_path(\"Chi-SDOH.shp\") # input shape file with data\n",
+ "infileq = get_path(\"Chi-SDOH_q.gal\") # queen contiguity weights from GeoDa\n",
+ "y_name = 'YPLL_rate'\n",
+ "x_names1 = ['Blk14P','Hisp14P']\n",
+ "x_names2 = ['Blk14P','Hisp14P','HIS_ct']\n",
+ "yend_names = ['HIS_ct']\n",
+ "q_names = ['COORD_X', 'COORD_Y']\n",
+ "ds_name = 'Chi-SDOH'\n",
+ "w_name = 'Chi-SDOH_q'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "91648058",
+ "metadata": {},
+ "source": [
+ "#### Read files and extract variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4103013d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dfs = gpd.read_file(infileshp)\n",
+ "wq = open(infileq).read() # queen contiguity weights\n",
+ "wq.transform = 'r' # row-transform the weights\n",
+ "y = dfs[y_name]\n",
+ "x1 = dfs[x_names1]\n",
+ "x2 = dfs[x_names2]\n",
+ "yend = dfs[yend_names]\n",
+ "q = dfs[q_names]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e7765db8",
+ "metadata": {},
+ "source": [
+ "## Specification Tests\n",
+ "\n",
+ "The specification tests for spatial autocorrelation in regression residuals are invoked by setting the argument `spat_diag = True` in the OLS call and passing a spatial weights object. This will result in the LM tests being added to the result output. In order to also list Moran's I, the argument `moran = True` is required as well. Since this test statistic involves more computation, it is not included in the default `spat_diag` listing.\n",
+ "\n",
+ "A minimal sketch of the type of call involved is shown below. Before considering the actual implementation of these tests in `spreg` and the interpretation of their results, their formal expressions are briefly summarized."
+ ]
+ },
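+ {
+ "cell_type": "markdown",
+ "id": "1a2b3c40",
+ "metadata": {},
+ "source": [
+ "The sketch assumes the variables **y** and **x2** and the row-standardized weights **wq** constructed above; it is meant purely as an illustration of the required arguments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a2b3c41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# minimal sketch: OLS with spatial diagnostics and Moran's I (illustration only)\n",
+ "ols_diag = OLS(y,x2,w=wq,spat_diag=True,moran=True,\n",
+ "               name_w=w_name,name_ds=ds_name)\n",
+ "print(ols_diag.summary)"
+ ]
+ },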
+ {
+ "cell_type": "markdown",
+ "id": "3acd8851",
+ "metadata": {},
+ "source": [
+ "### Moran's I"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65fbe0bd",
+ "metadata": {},
+ "source": [
+ "Formally, the Moran's I test statistic is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "I = \\frac{e'We / S_0}{e'e / n}.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "In this expression, $S_0 = \\sum_i \\sum_j w_{ij}$ is the sum of the weights in matrix $W$,\n",
+ "and $e$ are the residuals.\n",
+ "\n",
+ "Inference for Moran's I is based on an asymptotic standard normal approximation.\n",
+ "The statistic itself is first converted into a standardized or z-value by subtracting\n",
+ "the mean and dividing by the standard deviation. Those two moments are obtained\n",
+ "under the null hypothesis of no spatial autocorrelation. \n",
+ "\n",
+ "The moments of Moran's I for regression residuals,\n",
+ " under the null of no spatial autocorrelation (for the\n",
+ "regression error terms) were derived by Cliff and Ord (1972) (Testing for spatial autocorrelation among regression residuals, *Geographical Analysis* 4, 267-284).\n",
+ "\n",
+ "The mean is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "E[ I ] = \\frac{tr(MW)}{(n - k)},\n",
+ "\\end{equation*}\n",
+ "\n",
+ "where the matrix $M$ in the trace expression follows the conventional\n",
+ "notation as $M = I - X(X'X)^{-1}X'$, a $n \\times n$ projection matrix.\n",
+ "\n",
+ "The variance of Moran's I is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "Var[I] = \\frac{ tr(MWMW') + tr(MWMW) + (tr(MW))^2}\n",
+ " {(n - k)(n - k + 2)} - ( E[I])^2.\n",
+ "\\end{equation*}\n",
+ "\n",
+ "The so-called z-value is then obtained in the usual fashion, as:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "I_z = \\frac{I - E[I]}{\\sqrt{Var[I]}} \\sim \\ N(0, 1).\n",
+ "\\end{equation*}\n",
+ "\n",
+ "The $I_z$ statistic has an asymptotic distribution that is approximated by \n",
+ "the standard normal.\n",
+ "\n",
+ "Moran's I is a very powerful misspecification test. While designed to detect spatial error autocorrelation, it also has power against a range of other misspecifications, including heteroskedasticity and non-normality. Hence, when the null is *not* rejected, one can be fairly confident that none of these misspecifications are present. On the other hand, when the null is rejected, it is not always clear what the next step should be, other than perhaps implementing HAC standard errors. In contrast, the LM statistics are so-called focused tests against spatial autocorrelation that consider a specific alternative (lag or error). A small numerical sketch of the computation of Moran's I and its moments is given below."
+ ]
+ },
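+ {
+ "cell_type": "markdown",
+ "id": "1a2b3c42",
+ "metadata": {},
+ "source": [
+ "Purely to make these expressions concrete, the sketch below computes $I$, its moments and the z-value directly from the matrix formulas, using the full array of the (row-standardized) queen weights and the residuals of an auxiliary least squares fit obtained with numpy. This is for illustration only; `spreg` reports these results automatically when `moran = True`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a2b3c43",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative computation of Moran's I for OLS residuals from the formulas above\n",
+ "wfull = wq.full()[0]                          # full n x n array of the queen weights\n",
+ "yv = np.asarray(y).reshape(-1,1)\n",
+ "xv = np.hstack((np.ones((yv.shape[0],1)),np.asarray(x2)))   # X with constant term\n",
+ "n, k = xv.shape\n",
+ "b = np.linalg.lstsq(xv,yv,rcond=None)[0]      # OLS coefficients\n",
+ "e = yv - xv @ b                               # OLS residuals\n",
+ "s0 = wfull.sum()\n",
+ "i_moran = float(e.T @ wfull @ e / s0) / float(e.T @ e / n)\n",
+ "m = np.eye(n) - xv @ np.linalg.inv(xv.T @ xv) @ xv.T   # projection matrix M\n",
+ "mw = m @ wfull\n",
+ "mwt = m @ wfull.T\n",
+ "mean_i = np.trace(mw) / (n - k)\n",
+ "var_i = (np.trace(mw @ mwt) + np.trace(mw @ mw) + np.trace(mw)**2) / ((n - k)*(n - k + 2)) - mean_i**2\n",
+ "z_i = (i_moran - mean_i) / np.sqrt(var_i)\n",
+ "print(f\"Moran's I = {i_moran:.4f}, z-value = {z_i:.4f}\")"
+ ]
+ },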
+ {
+ "cell_type": "markdown",
+ "id": "87d52c95",
+ "metadata": {},
+ "source": [
+ "### LM Statistics for Spatial Lag and Spatial Error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e60bd9c0",
+ "metadata": {},
+ "source": [
+ "Technical details pertaining to these tests are given in Chapter 5 of Anselin and Rey (2014).\n",
+ "The original results are from Burridge (1980) (On the Cliff-Ord test for spatial autocorrelation, *Journal of the Royal Statistical Society B*, 42, 107-108) for the LM-Error test, Anselin (1988) (Lagrange Multiplier test diagnostics for spatial dependence and spatial heterogeneity, *Geographical Analysis* 20, 1-17), for the LM-Lag test, and Anselin, Bera, Florax, Yoon (1996) (Simple diagnostic tests for spatial dependence, *Regional Science and Urban Economics* 26, 77-104) for their robust forms.\n",
+ "\n",
+ "The LM test for spatial lag is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\rho} = \\frac{d_{\\rho}^2}{D} \\sim \\chi^2(1),\n",
+ "\\end{equation*}\n",
+ "\n",
+ "with $d_{\\rho} = e'Wy / \\hat{\\sigma}^2$ as the score for $\\rho$, and\n",
+ "$D = (WX\\hat{\\beta})' M (WX \\hat{\\beta}) / \\hat{\\sigma}^2 + T$, where\n",
+ "$M = I - X(X'X)^{-1}X'$, $T = tr(WW + W'W)$ (with tr as the trace of\n",
+ "a matrix), $e$ is the OLS residual vector and $\\hat{\\beta}$ are the OLS\n",
+ "regression coefficients (so, $X \\hat{\\beta}$ is the vector of predicted values).\n",
+ "\n",
+ "The LM test for spatial error is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\lambda} = \\frac{d_{\\lambda}^2}{T} \\sim \\chi^2(1),\n",
+ "\\end{equation*}\n",
+ "\n",
+ "where $d_{\\lambda} = (e'We) /\\hat{\\sigma}^2$ is the score for $\\lambda$ and $T$\n",
+ "is as before.\n",
+ "\n",
+ "The LM test for lag robust to the presence of error is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\rho}^* = \\frac{(d_{\\rho} - d_{\\lambda} )^2}{( D - T)} \\sim \\chi^2(1),\n",
+ "\\end{equation*}\n",
+ "\n",
+ "The LM test for error robust to the presence of lag is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\lambda}^* = \\frac{(d_{\\lambda} - TD^{-1}d_{\\rho} )^2}{[T (1 - TD^{-1}) ] } \\sim \\chi^2(1),\n",
+ "\\end{equation*}\n",
+ "\n",
+ "in the same notation. The numerical sketch below continues the earlier illustration with these four statistics."
+ ]
+ },
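+ {
+ "cell_type": "markdown",
+ "id": "1a2b3c44",
+ "metadata": {},
+ "source": [
+ "Continuing the illustrative computation from the sketch above (and reusing `wfull`, `xv`, `yv`, `b`, `e`, `m`, `n` and `k` defined there), the four LM statistics follow directly from these expressions. Again, this is only meant to make the formulas concrete; `spreg` reports the statistics when `spat_diag = True`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a2b3c45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative computation of the LM and robust LM statistics for lag and error\n",
+ "sig2 = float(e.T @ e) / n                          # sigma^2 hat\n",
+ "t_w = np.trace(wfull @ wfull + wfull.T @ wfull)    # T = tr(WW + W'W)\n",
+ "d_rho = float(e.T @ (wfull @ yv)) / sig2           # score for rho\n",
+ "d_lam = float(e.T @ (wfull @ e)) / sig2            # score for lambda\n",
+ "wxb = wfull @ (xv @ b)                             # W X beta-hat\n",
+ "d_big = float(wxb.T @ m @ wxb) / sig2 + t_w        # D\n",
+ "lm_lag = d_rho**2 / d_big\n",
+ "lm_err = d_lam**2 / t_w\n",
+ "lm_lag_r = (d_rho - d_lam)**2 / (d_big - t_w)                        # robust LM-lag\n",
+ "lm_err_r = (d_lam - (t_w/d_big)*d_rho)**2 / (t_w*(1.0 - t_w/d_big))  # robust LM-error\n",
+ "print(f'LM-lag = {lm_lag:.3f}, LM-error = {lm_err:.3f}')\n",
+ "print(f'robust LM-lag = {lm_lag_r:.3f}, robust LM-error = {lm_err_r:.3f}')"
+ ]
+ },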
+ {
+ "cell_type": "markdown",
+ "id": "48984604",
+ "metadata": {},
+ "source": [
+ "### LM Statistics for Spatial Durbin Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64cadb7c",
+ "metadata": {},
+ "source": [
+ "In Koley and Bera (2024) (To use or not to use the spatial Durbin model? - that is the question. *Spatial Economic Analysis* 19, 30-56), robust\n",
+ "LM tests are derived for $\\rho$ and $\\gamma$ in the spatial Durbin model. Again, the point of departure is\n",
+ "an OLS regression of the classic non-spatial specification. \n",
+ "\n",
+ "LM test for $\\gamma$:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\gamma} = \\frac{(e'(WX_0))[(WX_0)'M(WX_0)]^{-1}((WX_0)'e)}{\\hat{\\sigma}^2} \\sim \\chi^2(h),\n",
+ "\\end{equation*}\n",
+ "\n",
+ "where $h = k-1$ (i.e., not counting the constant term), and $X_0$ is the $X$ matrix without the constant column. The expression for the $LM_{\\rho}$ test is the same as before.\n",
+ "\n",
+ "Also, the joint LM test on $\\rho$ and $\\gamma$ is:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\rho\\gamma} = \n",
+ "\\begin{bmatrix}\n",
+ "(Wy)'e / \\hat{\\sigma}^2 & e'(WX_0) / \\hat{\\sigma}^2\n",
+ "\\end{bmatrix}\n",
+ "\\begin{bmatrix}\n",
+ "(WX\\beta)'M(WX\\hat{\\beta}) + T & (WX\\hat{\\beta})'M(WX_0) \\\\\n",
+ "(WX_0)'M(WX\\hat{\\beta}) & (WX_0)'M(WX_0)\n",
+ "\\end{bmatrix}^{-1}\n",
+ "\\begin{bmatrix}\n",
+ "(Wy)'e / \\hat{\\sigma}^2 \\\\\n",
+ "(WX_0)'e / \\hat{\\sigma}^2\n",
+ "\\end{bmatrix} \\sim \\chi^2(k)\n",
+ "\\end{equation*}\n",
+ "\n",
+ "In these expressions, $T$ and\n",
+ "$M$ are as before.\n",
+ "\n",
+ "The robust forms of the $LM_{\\rho}$ and $LM_{\\gamma}$ tests can be obtained from the result of the joint test and the expressions for $LM_{\\rho}$ and\n",
+ "$LM_{\\gamma}$, since the following equality holds:\n",
+ "\n",
+ "\\begin{equation*}\n",
+ "LM_{\\rho\\gamma} = LM_{\\rho} + LM_{\\gamma}^* = LM_{\\rho}^* + LM_{\\gamma}.\n",
+ "\\end{equation*}\n"
+ ]
+ },
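+ {
+ "cell_type": "markdown",
+ "id": "a1b2c303",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of the LM test on the WX coefficients, evaluated from the formula above on synthetic data (illustrative names only; `spreg` reports this test as part of the Spatial Durbin block when `spat_diag=True`):\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import libpysal\n",
+ "from scipy.stats import chi2\n",
+ "\n",
+ "rng = np.random.default_rng(12345)\n",
+ "n = 100\n",
+ "w = libpysal.weights.lat2W(10, 10)\n",
+ "w.transform = 'r'\n",
+ "W = w.full()[0]\n",
+ "\n",
+ "X0 = rng.normal(size=(n, 2))                      # regressors without the constant\n",
+ "X = np.hstack((np.ones((n, 1)), X0))\n",
+ "y = X @ np.array([[1.0], [0.5], [-0.3]]) + rng.normal(size=(n, 1))\n",
+ "\n",
+ "b = np.linalg.solve(X.T @ X, X.T @ y)\n",
+ "e = y - X @ b\n",
+ "sig2 = (e.T @ e).item() / n\n",
+ "M = np.eye(n) - X @ np.linalg.solve(X.T @ X, X.T)\n",
+ "\n",
+ "WX0 = W @ X0\n",
+ "LM_gamma = (e.T @ WX0 @ np.linalg.solve(WX0.T @ M @ WX0, WX0.T @ e)).item() / sig2\n",
+ "h = X0.shape[1]                                   # degrees of freedom, k - 1\n",
+ "print('LM_gamma', round(LM_gamma, 3), 'p =', round(chi2.sf(LM_gamma, h), 4))\n",
+ "# given the one-directional and joint statistics, the identity above\n",
+ "# (joint = LM_rho + robust LM_gamma = robust LM_rho + LM_gamma)\n",
+ "# recovers the robust forms by subtraction\n",
+ "```"
+ ]
+ },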
+ {
+ "cell_type": "markdown",
+ "id": "c2894211",
+ "metadata": {},
+ "source": [
+ "### Diagnostics in a 2SLS Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e41f826f",
+ "metadata": {},
+ "source": [
+ "As in the classic OLS estimation, the residuals from 2SLS\n",
+ "estimation can be assessed for spatial autocorrelation. Specifically, the so-called $AK$\n",
+ "test extends the principle behind\n",
+ " the Moran's I statistic to residuals from a 2SLS estimation (Anselin and Kelejian 1997. Testing for spatial error autocorrelation in the presence of endogenous regressors. *International Regional Science Review* 20, 153-182).\n",
+ "\n",
+ "The\n",
+ "statistic reduces to the same expression as the Lagrange Multiplier test for \n",
+ "error spatial autocorrelation, but using the residuals\n",
+ "from the 2SLS regression. Formally:\n",
+ "\\begin{equation*}\n",
+ "AK = \\frac{[ (e'We)/e'e/n) ]^2}{tr(WW + W'W)} \\sim \\chi^2(1)\n",
+ "\\end{equation*}\n",
+ "where $e$ is a vector of 2SLS residuals, $W$ is the spatial weights matrix, and and $tr$\n",
+ "stands for a matrix trace expression. The statistic is distributed asymptotically as \n",
+ "Chi-squared with one degree of freedom.\n",
+ "\n",
+ "Some caution is needed in the interpretation of the results of the $AK$ test. \n",
+ "Even though it takes the form of an $LM$ test, the statistic\n",
+ "is really a generalization \n",
+ "of Moran's I and therefore not actually a Lagrange Multiplier test. Since estimation\n",
+ "is based on 2SLS, there is no assumption of normality and thus also no likelihood\n",
+ "function (on which the $LM$ statistic is based). Therefore, \n",
+ " the \n",
+ "$AK$ test needs to be interpreted as a diffuse test rather than as a focused\n",
+ "test (e.g., in the standard $LM$ case). In other words, the\n",
+ "rejection of the null of no spatial autocorrelation does not point to either a lag or an error\n",
+ "specification as the proper alternative. As is the case for Moran's I in a classic regression, rejection\n",
+ "of the null points to the absence of independence, but not to a particular specification\n",
+ "that may be the reason for the spatial correlation.\n"
+ ]
+ },
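+ {
+ "cell_type": "markdown",
+ "id": "a1b2c304",
+ "metadata": {},
+ "source": [
+ "A minimal sketch of this computation on synthetic data (illustrative names only): estimate a plain 2SLS with `spreg.TSLS`, take its residuals, and plug them into the expression above. With `spat_diag=True` and a weights matrix, the AK test is reported directly, as shown later in this notebook.\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "import libpysal\n",
+ "from scipy.stats import chi2\n",
+ "from spreg import TSLS\n",
+ "\n",
+ "rng = np.random.default_rng(12345)\n",
+ "n = 100\n",
+ "w = libpysal.weights.lat2W(10, 10)\n",
+ "w.transform = 'r'\n",
+ "W = w.full()[0]\n",
+ "\n",
+ "x = rng.normal(size=(n, 1))                       # exogenous regressor\n",
+ "q = rng.normal(size=(n, 1))                       # instrument\n",
+ "yend = 0.8 * q + rng.normal(size=(n, 1))          # regressor treated as endogenous\n",
+ "y = 1.0 + 0.5 * x + 0.7 * yend + rng.normal(size=(n, 1))\n",
+ "\n",
+ "tsls = TSLS(y, x, yend=yend, q=q)\n",
+ "e = tsls.u                                        # 2SLS residuals\n",
+ "ak = ((e.T @ (W @ e)).item() / ((e.T @ e).item() / n))**2 / np.trace(W @ W + W.T @ W)\n",
+ "print('AK', round(ak, 3), 'p =', round(chi2.sf(ak, 1), 4))\n",
+ "```"
+ ]
+ },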
+ {
+ "cell_type": "markdown",
+ "id": "edf9be3d",
+ "metadata": {},
+ "source": [
+ "## Spatial Diagnostics in OLS Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ed4d91c",
+ "metadata": {},
+ "source": [
+ "To obtain the spatial diagnostics, the argument `spat_diag = True` must be set, with, in addition, `moran = True` if Moran's I is desired. Also, a spatial weights matrix must be specified as the `w` argument (optionally with its name in `name_w`).\n",
+ "\n",
+ "#### Two explanatory variables\n",
+ "\n",
+ "First, this is illustrated for the immigrant paradox regression with just two explanatory variables, i.e., with the arguments **y** and **x1**. The call to `OLS` is the same as in the earlier notebook, but now with the arguments for the spatial diagnostics included."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e00b184",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols1 = OLS(y,x1,w=wq,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(ols1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "58b8b9e5",
+ "metadata": {},
+ "source": [
+ "The regression output is augmented with a section at the bottom, entitled DIAGNOSTICS FOR SPATIAL DEPENDENCE. It is organized into two parts, one dealing the SAR-Error model as the alternative specification, the other with the Spatial Durbin specification (SDM).\n",
+ "\n",
+ "In the SAR-Error part, the first result (if `moran=True`) is for Moran's I. The value of the Moran's I is listed under MI/DF, here 0.1271, with the associated z-value under VALUE, as 6.521. Finally, in the PROB column, the associated p-value is given. In this example, Moran's I is highly significant, suggesting a misspecification problem. However, since the test is not focused on a specific alternative, it is not clear what the next step would be. Also, there is very strong evidence of heteroskedasticity, against which Moran's I has power as well.\n",
+ "\n",
+ "The next set of tests are the LM tests and their associated robust forms, first for Lag and then for Error. The LM-Lag test (26.211) is highly significant, but its robust form (0.258) is not. In contrast, both LM-Error (39.569) and the associated robust form (13.615) are highly significant, suggesting an Error alternative. Finally, the joint test for Lag and Error is highly significant as well. This test is not always indicative of the need for a higher order alternative, since it has high power against the single alternatives as well.\n",
+ "\n",
+ "The upshot of these statistics is strong evidence towards a spatial error alternative.\n",
+ "\n",
+ "The final set of tests are the LM tests in a Spatial Durbin context. The LM test for the coefficients of WX is given first, with its robust counterpart. The degrees of freedom (DF, 2) match the number of explanatory variables (not counting the constant term). It this example, the LM test of 0.836 is not significant, but its robust counterpart of 14.193 is (robust to the presence of a spatial lag). The LM test for Lag has the same value as in the SAR-Error context (26.211), but its robust form (39.569) is now robust to the presence of an SLX term and is highly significant. The joint test for Spatial Durbin (40.404) has degrees of freedom equal to the number of explanatory variables + 1, or 3 in this case. It is highly significant.\n",
+ "\n",
+ "Koley and Bera (2024) suggest the following interpretation of these results. First, consider whether the joint test is significant, which it clearly is. Then consider each of the robust forms of the test statistic, which are also both highly significant in this example. This would point to a Spatial Durbin alternative. However, there may be some other misspecifications going on here, since the values of the robust tests are *larger* than their original counterparts, which is not standard behavior. In a strict sense, the robust tests are robust to *small* departures from the null and they should be *smaller* than the original test, since they *correct* for the ignored misspecification. When this inequality does not hold, there is an indication that the misspecification is more major than the correction is able to accommodate.\n",
+ "\n",
+ "Clearly, the recommendations given by the SAR-Error and the Spatial Durbin contexts are at odds. This is further explored in the notebook on specification searches."
+ ]
+ },
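+ {
+ "cell_type": "markdown",
+ "id": "a1b2c305",
+ "metadata": {},
+ "source": [
+ "For programmatic use, the individual statistics can also be pulled from the fitted object rather than read off the printed table. The attribute names below (`moran_res`, `lm_lag`, `lm_error`, `rlm_lag`, `rlm_error`, `lm_sarma`) are taken as assumptions from the `spreg.OLS` documentation; each is a (statistic, p-value) tuple, with `moran_res` holding (I, z-value, p-value). The short decision logic at the end is just a rough coding of the robust-LM reading of the SAR-Error block described above.\n",
+ "\n",
+ "```python\n",
+ "# builds on ols1 from the call above\n",
+ "print('Moran I, z, p  :', ols1.moran_res)\n",
+ "print('LM-lag         :', ols1.lm_lag)\n",
+ "print('robust LM-lag  :', ols1.rlm_lag)\n",
+ "print('LM-error       :', ols1.lm_error)\n",
+ "print('robust LM-error:', ols1.rlm_error)\n",
+ "print('LM-SARMA       :', ols1.lm_sarma)\n",
+ "\n",
+ "# look at the robust forms only when the joint test rejects\n",
+ "if ols1.lm_sarma[1] < 0.01:\n",
+ "    if ols1.rlm_error[1] < 0.01 and ols1.rlm_lag[1] >= 0.01:\n",
+ "        print('-> evidence points to a spatial error alternative')\n",
+ "    elif ols1.rlm_lag[1] < 0.01 and ols1.rlm_error[1] >= 0.01:\n",
+ "        print('-> evidence points to a spatial lag alternative')\n",
+ "    else:\n",
+ "        print('-> consider a higher order or Spatial Durbin alternative')\n",
+ "```"
+ ]
+ },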
+ {
+ "cell_type": "markdown",
+ "id": "ab470abe",
+ "metadata": {},
+ "source": [
+ "#### Three explanatory variables\n",
+ "\n",
+ "With the hardship indicator (**HIS_ct**) included in the regression, the call uses **x2**, but is otherwise identical to the previous one."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9e356306",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ols2 = OLS(y,x2,w=wq,spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(ols2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e91fcee3",
+ "metadata": {},
+ "source": [
+ "The inclusion of the hardship indication in the regression specification changes not only the interpretation of the regression coefficients, but also greatly affects the results for the spatial diagnostics. In the SAR-Error context, the LM-Lag statistic is still significant, but its robust form is not. There is now only the weakest of evidence for the Error case (5.705 with a p-value of 0.02). Neither of the robust tests are significant. The joint LM test (7.329) only achieves a p-value of 0.0256, which provides very weak evidence. On the other hand, Moran's I remains significant, but much less so than before (z-value of 2.619 with p=0.0088).\n",
+ "\n",
+ "The situation is completely different on the Spatial Durbin front. Now, the values for the robust forms of the tests are smaller than the original counterparts, as they should be. There is strong evidence for an SLX alternative. Following the Koley-Bera (2024) recommendations, interest focuses on the robust forms of the one-directional tests, since the joint test is highly significant. The robust Lag test (5.705) is no longer significant for a p-value of 0.01 (p=0169), whereas the robust WX test (17.742) is still highly significant (p=0.0005). This would suggest an SLX alternative, or possibly, with a lower standard for significance (e.g., p=0.05), a Spatial Durbin model.\n",
+ "\n",
+ "Since it is straightforward to estimate an SLX model by means of OLS, the effect of this specification on the spatial diagnostics is examined next. A full consideration of the estimation of various SLX models is considered in a separate notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f76a7ef2",
+ "metadata": {},
+ "source": [
+ "## Spatial Diagnostics in SLX Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "057a2703",
+ "metadata": {},
+ "source": [
+ "The call for the estimation of an SLX specification is the same as for any other OLS estimation, except that an additional argument `slx_lags=1` must be included. If higher order lags are desired, the argument to `slx_lags` must be adjusted accordingly.\n",
+ "\n",
+ "The results for the full regression (with **x2**) is:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e2c6636d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "slx = OLS(y,x2,w=wq,slx_lags=1,\n",
+ " spat_diag=True,moran=True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(slx.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9e76d1a1",
+ "metadata": {},
+ "source": [
+ "The inclusion of the three SLX terms changes the estimates and significance of the other coefficients considerably. The **Hisp14P** variable is no longer significant, but its spatial lag is, with a large negative coefficient. All three SLX coefficients are significant. However, their magnitude and sign raise some concerns. It is very difficult to interpret a case where the coefficient of the neighbor's influence has a different sign from the original coefficient, as is the case for **Blk14P** and **W_Blk14P**. This suggests negative spatial autocorrelation, which is rare, but not impossible, although it runs counter to Tobler's first law of geography.\n",
+ "\n",
+ "Similarly, obtaining a coefficient for WX that is larger than the corresponding coefficient for X suggests a stronger effect of the neighbors than for the location itself, which runs counter to the distance decay implied by Tobler's law. While this does not make the SLX specification invalid as such, it does require a careful consideration of the interpretation of the coefficients.\n",
+ "\n",
+ "Diagnostics are only reported for the SAR-Error alternatives, since the WX term is already included in the model. This inclusion seems to have eliminated most evidence for remaining spatial autocorrelation. While Moran's I and the LM tests are weakly significant (but *not* for a p-value of p=0.01), the robust LM tests are not, and neither is the joint LM test.\n",
+ "\n",
+ "This would suggest that the inclusion of the SLX terms has taken care of the spatial autocorrelation *problem*. However, as mentioned, the signs and magnitudes of the coefficients are difficult to interpret and would demand closer scrutiny."
+ ]
+ },
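+ {
+ "cell_type": "markdown",
+ "id": "a1b2c306",
+ "metadata": {},
+ "source": [
+ "To make the sign and magnitude comparison explicit, a small sketch that pairs each X coefficient with its spatial lag (it assumes the fitted object exposes aligned `betas` and `name_x` attributes, with the lagged terms carrying a `W_` prefix as in the printed summary):\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "# builds on slx from the call above\n",
+ "coefs = dict(zip(slx.name_x, np.ravel(slx.betas)))\n",
+ "for name, b in coefs.items():\n",
+ "    lagged = 'W_' + name\n",
+ "    if lagged in coefs:\n",
+ "        relation = 'same sign' if b * coefs[lagged] > 0 else 'opposite signs'\n",
+ "        print(name, round(b, 4), '|', lagged, round(coefs[lagged], 4), '|', relation)\n",
+ "```"
+ ]
+ },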
+ {
+ "cell_type": "markdown",
+ "id": "7471653e",
+ "metadata": {},
+ "source": [
+ "## Spatial Diagnostics in 2SLS Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9bb8b6b3",
+ "metadata": {},
+ "source": [
+ "The AK test is included in the output of a 2SLS regression when `spat_diag=True` and spatial weights are specified. In all other respects, the call is the same as reviewed in a previous notebook, with **x1** as the exogenous explanatory variables, **yend** as the endogenous variable and **q** as the instruments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e90e6a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tsls1 = TSLS(y,x=x1,yend=yend,q=q,w=wq,spat_diag = True,\n",
+ " name_w=w_name,name_ds=ds_name)\n",
+ "print(tsls1.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f55944e8",
+ "metadata": {},
+ "source": [
+ "In contrast to what was found with Moran's I for the OLS case, there is no evidence for residual spatial autocorrelation when the variable **HIS_CT** is treated as being endogenous."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "65aadbff",
+ "metadata": {},
+ "source": [
+ "## Practice\n",
+ "\n",
+ "At this point, it would most effective if you could continue with your baseline regression, assess whether there is any evidence of spatial autocorrelation and what type of alternative suggests itself. If warranted, check the effect of including SLX terms or correcting for endogeneity on the regression coefficients and spatial diagnostics. Consider using a number of different spatial weights to assess the robustness of your findings."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "py312",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/_sources/notebooks/GM_Lag_example.ipynb.txt b/_sources/notebooks/GM_Lag_example.ipynb.txt
new file mode 100644
index 00000000..ada225b7
--- /dev/null
+++ b/_sources/notebooks/GM_Lag_example.ipynb.txt
@@ -0,0 +1,511 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---------------------------------------\n",
+ "\n",
+ "# Spatial 2SLS\n",
+ "\n",
+ "* **This notebook contains the PySAL/spreg code for Chapter 7 - Spatial 2SLS**\n",
+ " * *in: Modern Spatial Econometrics in Practice: A Guide to GeoDa, GeoDaSpace and PySAL.*\n",
+ " * *by: Luc Anselin and Sergio J. Rey.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:48.619651Z",
+ "start_time": "2021-01-03T16:26:48.586688Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Last updated: 2021-01-03T11:26:48.607797-05:00\n",
+ "\n",
+ "Python implementation: CPython\n",
+ "Python version : 3.8.6\n",
+ "IPython version : 7.19.0\n",
+ "\n",
+ "Compiler : Clang 11.0.0 \n",
+ "OS : Darwin\n",
+ "Release : 20.2.0\n",
+ "Machine : x86_64\n",
+ "Processor : i386\n",
+ "CPU cores : 8\n",
+ "Architecture: 64bit\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "%load_ext watermark\n",
+ "%watermark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:49.970828Z",
+ "start_time": "2021-01-03T16:26:48.621823Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Watermark: 2.1.0\n",
+ "\n",
+ "spreg : 1.2.0.post1\n",
+ "libpysal: 4.3.0\n",
+ "numpy : 1.19.4\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy\n",
+ "import libpysal\n",
+ "import spreg\n",
+ "\n",
+ "%watermark -w\n",
+ "%watermark -iv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:49.976860Z",
+ "start_time": "2021-01-03T16:26:49.973373Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "baltim\n",
+ "======\n",
+ "\n",
+ "Baltimore house sales prices and hedonics, 1978. \n",
+ "----------------------------------------------------------------\n",
+ "\n",
+ "* baltim.dbf: attribute data. (k=17)\n",
+ "* baltim.shp: Point shapefile. (n=211)\n",
+ "* baltim.shx: spatial index.\n",
+ "* baltim.tri.k12.kwt: kernel weights using a triangular kernel with 12 nearest neighbors in KWT format.\n",
+ "* baltim_k4.gwt: nearest neighbor weights (4nn) in GWT format.\n",
+ "* baltim_q.gal: queen contiguity weights in GAL format.\n",
+ "* baltimore.geojson: spatial weights in geojson format.\n",
+ "\n",
+ "Source: Dubin, Robin A. (1992). Spatial autocorrelation and neighborhood quality. Regional Science and Urban Economics 22(3), 433-452.\n"
+ ]
+ }
+ ],
+ "source": [
+ "libpysal.examples.explain(\"baltim\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:49.991200Z",
+ "start_time": "2021-01-03T16:26:49.979155Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Read Baltimore data\n",
+ "db = libpysal.io.open(libpysal.examples.get_path(\"baltim.dbf\"), \"r\")\n",
+ "ds_name = \"baltim.dbf\"\n",
+ "\n",
+ "# Read dependent variable\n",
+ "y_name = \"PRICE\"\n",
+ "y = numpy.array(db.by_col(y_name)).T\n",
+ "y = y[:, numpy.newaxis]\n",
+ "\n",
+ "# Read exogenous variables\n",
+ "x_names = [\"NROOM\", \"NBATH\", \"PATIO\", \"FIREPL\", \"AC\", \"GAR\", \"AGE\", \"LOTSZ\", \"SQFT\"]\n",
+ "x = numpy.array([db.by_col(var) for var in x_names]).T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:50.062604Z",
+ "start_time": "2021-01-03T16:26:49.993233Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Read spatial data\n",
+ "ww = libpysal.io.open(libpysal.examples.get_path(\"baltim_q.gal\"))\n",
+ "w = ww.read()\n",
+ "ww.close()\n",
+ "w_name = \"baltim_q.gal\"\n",
+ "w.transform = \"r\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic Spatial 2SLS\n",
+ "\n",
+ "The model to estimate is:\n",
+ "\n",
+ "$$ y = \\rho Wy + X \\beta + \\epsilon $$\n",
+ "\n",
+ "where you use $WX$ as instruments of $Wy$."
+ ]
+ },
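+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick check of what that instrument set contains, the spatial lags of the exogenous regressors can be built by hand; this is only a sketch (it assumes `libpysal.weights.lag_spatial`), since `spreg.GM_Lag` constructs the instruments internally.\n",
+ "\n",
+ "```python\n",
+ "from libpysal.weights import lag_spatial\n",
+ "\n",
+ "wx = lag_spatial(w, x)    # n x k matrix of W_NROOM, W_NBATH, ..., W_SQFT\n",
+ "print(wx.shape)\n",
+ "```"
+ ]
+ },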
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:50.074860Z",
+ "start_time": "2021-01-03T16:26:50.065491Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES\n",
+ "--------------------------------------------------\n",
+ "Data set : baltim\n",
+ "Weights matrix : baltim_q\n",
+ "Dependent Variable : PRICE Number of Observations: 211\n",
+ "Mean dependent var : 44.3072 Number of Variables : 11\n",
+ "S.D. dependent var : 23.6061 Degrees of Freedom : 200\n",
+ "Pseudo R-squared : 0.7278\n",
+ "Spatial Pseudo R-squared: 0.6928\n",
+ "\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " CONSTANT -2.5762742 5.5210355 -0.4666288 0.6407655\n",
+ " NROOM 0.9440746 1.0609697 0.8898224 0.3735612\n",
+ " NBATH 5.5981348 1.7376725 3.2216283 0.0012746\n",
+ " PATIO 5.8424768 2.7435166 2.1295577 0.0332081\n",
+ " FIREPL 6.4579185 2.4238370 2.6643369 0.0077140\n",
+ " AC 5.4871926 2.3450930 2.3398614 0.0192909\n",
+ " GAR 4.3565951 1.6955478 2.5694321 0.0101865\n",
+ " AGE -0.0730060 0.0523546 -1.3944510 0.1631814\n",
+ " LOTSZ 0.0579765 0.0149534 3.8771359 0.0001057\n",
+ " SQFT 0.0395330 0.1638055 0.2413409 0.8092909\n",
+ " W_PRICE 0.5823313 0.0721387 8.0723813 0.0000000\n",
+ "------------------------------------------------------------------------------------\n",
+ "Instrumented: W_PRICE\n",
+ "Instruments: W_AC, W_AGE, W_FIREPL, W_GAR, W_LOTSZ, W_NBATH, W_NROOM,\n",
+ " W_PATIO, W_SQFT\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = spreg.GM_Lag(\n",
+ " y,\n",
+ " x,\n",
+ " w=w,\n",
+ " name_y=y_name,\n",
+ " name_x=x_names,\n",
+ " name_w=\"baltim_q\",\n",
+ " name_ds=\"baltim\"\n",
+ ")\n",
+ "print(model.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Second order spatial lags\n",
+ "\n",
+ "You can also use $[WX, W^2X]$ as instruments of $Wy$."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:50.085089Z",
+ "start_time": "2021-01-03T16:26:50.076795Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES\n",
+ "--------------------------------------------------\n",
+ "Data set : baltim\n",
+ "Weights matrix : baltim_q\n",
+ "Dependent Variable : PRICE Number of Observations: 211\n",
+ "Mean dependent var : 44.3072 Number of Variables : 11\n",
+ "S.D. dependent var : 23.6061 Degrees of Freedom : 200\n",
+ "Pseudo R-squared : 0.7276\n",
+ "Spatial Pseudo R-squared: 0.6915\n",
+ "\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " CONSTANT -2.8867580 5.4563344 -0.5290654 0.5967601\n",
+ " NROOM 0.9527428 1.0613055 0.8977083 0.3693411\n",
+ " NBATH 5.5975309 1.7386698 3.2194330 0.0012844\n",
+ " PATIO 5.7884989 2.7409866 2.1118304 0.0347010\n",
+ " FIREPL 6.4012808 2.4201110 2.6450360 0.0081682\n",
+ " AC 5.4587589 2.3451078 2.3277220 0.0199269\n",
+ " GAR 4.3440361 1.6961624 2.5610969 0.0104342\n",
+ " AGE -0.0713188 0.0521742 -1.3669353 0.1716456\n",
+ " LOTSZ 0.0575329 0.0149111 3.8583943 0.0001141\n",
+ " SQFT 0.0377524 0.1638248 0.2304438 0.8177470\n",
+ " W_PRICE 0.5893267 0.0695101 8.4782886 0.0000000\n",
+ "------------------------------------------------------------------------------------\n",
+ "Instrumented: W_PRICE\n",
+ "Instruments: W2_AC, W2_AGE, W2_FIREPL, W2_GAR, W2_LOTSZ, W2_NBATH, W2_NROOM,\n",
+ " W2_PATIO, W2_SQFT, W_AC, W_AGE, W_FIREPL, W_GAR, W_LOTSZ,\n",
+ " W_NBATH, W_NROOM, W_PATIO, W_SQFT\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "# using second order spatial lags for the instruments, set w_lags = 2\n",
+ "model2 = spreg.GM_Lag(\n",
+ " y,\n",
+ " x,\n",
+ " w=w,\n",
+ " w_lags=2,\n",
+ " name_y=y_name,\n",
+ " name_x=x_names,\n",
+ " name_w=\"baltim_q\",\n",
+ " name_ds=\"baltim\"\n",
+ ")\n",
+ "print(model2.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Spatial Diagnostics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:50.099577Z",
+ "start_time": "2021-01-03T16:26:50.088864Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES\n",
+ "--------------------------------------------------\n",
+ "Data set : baltim\n",
+ "Weights matrix : baltim_q\n",
+ "Dependent Variable : PRICE Number of Observations: 211\n",
+ "Mean dependent var : 44.3072 Number of Variables : 11\n",
+ "S.D. dependent var : 23.6061 Degrees of Freedom : 200\n",
+ "Pseudo R-squared : 0.7278\n",
+ "Spatial Pseudo R-squared: 0.6928\n",
+ "\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " CONSTANT -2.5762742 5.5210355 -0.4666288 0.6407655\n",
+ " NROOM 0.9440746 1.0609697 0.8898224 0.3735612\n",
+ " NBATH 5.5981348 1.7376725 3.2216283 0.0012746\n",
+ " PATIO 5.8424768 2.7435166 2.1295577 0.0332081\n",
+ " FIREPL 6.4579185 2.4238370 2.6643369 0.0077140\n",
+ " AC 5.4871926 2.3450930 2.3398614 0.0192909\n",
+ " GAR 4.3565951 1.6955478 2.5694321 0.0101865\n",
+ " AGE -0.0730060 0.0523546 -1.3944510 0.1631814\n",
+ " LOTSZ 0.0579765 0.0149534 3.8771359 0.0001057\n",
+ " SQFT 0.0395330 0.1638055 0.2413409 0.8092909\n",
+ " W_PRICE 0.5823313 0.0721387 8.0723813 0.0000000\n",
+ "------------------------------------------------------------------------------------\n",
+ "Instrumented: W_PRICE\n",
+ "Instruments: W_AC, W_AGE, W_FIREPL, W_GAR, W_LOTSZ, W_NBATH, W_NROOM,\n",
+ " W_PATIO, W_SQFT\n",
+ "\n",
+ "DIAGNOSTICS FOR SPATIAL DEPENDENCE\n",
+ "TEST MI/DF VALUE PROB\n",
+ "Anselin-Kelejian Test 1 5.234 0.0221\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = spreg.GM_Lag(\n",
+ " y,\n",
+ " x,\n",
+ " w=w,\n",
+ " spat_diag=True,\n",
+ " name_y=y_name,\n",
+ " name_x=x_names,\n",
+ " name_w=\"baltim_q\",\n",
+ " name_ds=\"baltim\"\n",
+ ")\n",
+ "print(model.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## White Standard Errors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-03T16:26:50.112672Z",
+ "start_time": "2021-01-03T16:26:50.101959Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: SPATIAL TWO STAGE LEAST SQUARES\n",
+ "--------------------------------------------------\n",
+ "Data set : baltim\n",
+ "Weights matrix : baltim_q\n",
+ "Dependent Variable : PRICE Number of Observations: 211\n",
+ "Mean dependent var : 44.3072 Number of Variables : 11\n",
+ "S.D. dependent var : 23.6061 Degrees of Freedom : 200\n",
+ "Pseudo R-squared : 0.7278\n",
+ "Spatial Pseudo R-squared: 0.6928\n",
+ "\n",
+ "White Standard Errors\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " CONSTANT -2.5762742 7.0147591 -0.3672648 0.7134215\n",
+ " NROOM 0.9440746 1.4002856 0.6742015 0.5001832\n",
+ " NBATH 5.5981348 2.1605285 2.5910951 0.0095671\n",
+ " PATIO 5.8424768 2.9445656 1.9841558 0.0472385\n",
+ " FIREPL 6.4579185 2.4500195 2.6358641 0.0083923\n",
+ " AC 5.4871926 2.6021469 2.1087175 0.0349690\n",
+ " GAR 4.3565951 2.2070747 1.9739228 0.0483905\n",
+ " AGE -0.0730060 0.0976079 -0.7479516 0.4544894\n",
+ " LOTSZ 0.0579765 0.0237454 2.4415887 0.0146228\n",
+ " SQFT 0.0395330 0.2355809 0.1678105 0.8667323\n",
+ " W_PRICE 0.5823313 0.1325884 4.3920220 0.0000112\n",
+ "------------------------------------------------------------------------------------\n",
+ "Instrumented: W_PRICE\n",
+ "Instruments: W_AC, W_AGE, W_FIREPL, W_GAR, W_LOTSZ, W_NBATH, W_NROOM,\n",
+ " W_PATIO, W_SQFT\n",
+ "\n",
+ "DIAGNOSTICS FOR SPATIAL DEPENDENCE\n",
+ "TEST MI/DF VALUE PROB\n",
+ "Anselin-Kelejian Test 1 5.234 0.0221\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = spreg.GM_Lag(\n",
+ " y,\n",
+ " x,\n",
+ " w=w,\n",
+ " robust=\"white\",\n",
+ " spat_diag=True,\n",
+ " name_y=y_name,\n",
+ " name_x=x_names,\n",
+ " name_w=\"baltim_q\",\n",
+ " name_ds=\"baltim\"\n",
+ ")\n",
+ "print(model.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "-----------------------------"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:py38_spreg]",
+ "language": "python",
+ "name": "conda-env-py38_spreg-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.6"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {
+ "height": "103px",
+ "width": "212px"
+ },
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/_sources/notebooks/Panel_FE_example.ipynb.txt b/_sources/notebooks/Panel_FE_example.ipynb.txt
new file mode 100644
index 00000000..1db9c440
--- /dev/null
+++ b/_sources/notebooks/Panel_FE_example.ipynb.txt
@@ -0,0 +1,367 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "------------\n",
+ "\n",
+ "# Spatial Panel Models with Fixed Effects\n",
+ "\n",
+ "* **This notebook uses the [Panel_FE_Lag](https://pysal.org/spreg/generated/spreg.Panel_FE_Lag.html#spreg.Panel_FE_Lag) and [Panel_FE_Error](https://pysal.org/spreg/generated/spreg.Panel_FE_Error.html#spreg.Panel_FE_Error) classes.**\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:36:53.158014Z",
+ "start_time": "2021-01-04T16:36:50.182287Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import numpy\n",
+ "import libpysal\n",
+ "import spreg"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Open data on NCOVR US County Homicides (3085 areas).**\n",
+ "\n",
+ "* First, extract the HR (homicide rates) data in the 70's, 80's and 90's as the dependent variable.\n",
+ "* Data can also be passed in the long format instead of wide format.\n",
+ " * i.e. a vector with $n \\times t$ rows and a single column for the dependent variable, and\n",
+ " * a matrix of dimension $n \\times (t \\ast k)$ for the independent variables\n",
+ "* Then, extract RD and PS as independent variables in the regression."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:36:53.489678Z",
+ "start_time": "2021-01-04T16:36:53.160457Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Open data on NCOVR US County Homicides (3085 areas).\n",
+ "nat = libpysal.examples.load_example(\"NCOVR\")\n",
+ "db = libpysal.io.open(nat.get_path(\"NAT.dbf\"), \"r\")\n",
+ "\n",
+ "# Create spatial weight matrix\n",
+ "nat_shp = libpysal.examples.get_path(\"NAT.shp\")\n",
+ "w = libpysal.weights.Queen.from_shapefile(nat_shp)\n",
+ "w.transform = 'r'\n",
+ "\n",
+ "# Define dependent variable\n",
+ "name_y = [\"HR70\", \"HR80\", \"HR90\"]\n",
+ "y = numpy.array([db.by_col(name) for name in name_y]).T\n",
+ "\n",
+ "# Define independent variables\n",
+ "name_x = [\"RD70\", \"RD80\", \"RD90\", \"PS70\", \"PS80\", \"PS90\"]\n",
+ "x = numpy.array([db.by_col(name) for name in name_x]).T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "--------------------\n",
+ "\n",
+ "## Spatial Lag model\n",
+ "\n",
+ "Let's estimate a spatial lag panel model with fixed effects:\n",
+ "\n",
+ "$$\n",
+ "y = \\rho Wy + X\\beta + \\mu_i + e\n",
+ "$$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:36:59.736302Z",
+ "start_time": "2021-01-04T16:36:53.492370Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "fe_lag = spreg.Panel_FE_Lag(y, x, w, name_y=name_y, name_x=name_x, name_ds=\"NAT\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:36:59.741882Z",
+ "start_time": "2021-01-04T16:36:59.737965Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG PANEL - FIXED EFFECTS\n",
+ "-----------------------------------------------------------------------\n",
+ "Data set : NAT\n",
+ "Weights matrix : unknown\n",
+ "Dependent Variable : HR Number of Observations: 9255\n",
+ "Mean dependent var : 0.0000 Number of Variables : 3\n",
+ "S.D. dependent var : 3.9228 Degrees of Freedom : 9252\n",
+ "Pseudo R-squared : 0.0319\n",
+ "Spatial Pseudo R-squared: 0.0079\n",
+ "Sigma-square ML : 14.935 Log likelihood : -67936.533\n",
+ "S.E of regression : 3.865 Akaike info criterion : 135879.066\n",
+ " Schwarz criterion : 135900.465\n",
+ "\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " RD 0.8005886 0.1614474 4.9588189 0.0000007\n",
+ " PS -2.6003523 0.4935486 -5.2686851 0.0000001\n",
+ " W_HR 0.1903043 0.0159991 11.8947008 0.0000000\n",
+ "------------------------------------------------------------------------------------\n",
+ "Warning: Assuming panel is in wide format.\n",
+ "y[:, 0] refers to T0, y[:, 1] refers to T1, etc.\n",
+ "x[:, 0:T] refers to T periods of k1, x[:, T+1:2T] refers to k2, etc.\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(fe_lag.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:36:59.753663Z",
+ "start_time": "2021-01-04T16:36:59.743818Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[ 0.8006],\n",
+ " [-2.6004],\n",
+ " [ 0.1903]])"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "numpy.around(fe_lag.betas, decimals=4)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data can also be in 'long' format:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG PANEL - FIXED EFFECTS\n",
+ "-----------------------------------------------------------------------\n",
+ "Data set : NAT\n",
+ "Weights matrix : unknown\n",
+ "Dependent Variable : HR Number of Observations: 9255\n",
+ "Mean dependent var : 0.0000 Number of Variables : 3\n",
+ "S.D. dependent var : 3.9228 Degrees of Freedom : 9252\n",
+ "Pseudo R-squared : 0.0319\n",
+ "Spatial Pseudo R-squared: 0.0079\n",
+ "Sigma-square ML : 14.935 Log likelihood : -67936.533\n",
+ "S.E of regression : 3.865 Akaike info criterion : 135879.066\n",
+ " Schwarz criterion : 135900.465\n",
+ "\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " RD 0.8005886 0.1614474 4.9588189 0.0000007\n",
+ " PS -2.6003523 0.4935486 -5.2686851 0.0000001\n",
+ " W_HR 0.1903043 0.0159991 11.8947008 0.0000000\n",
+ "------------------------------------------------------------------------------------\n",
+ "Warning: Assuming panel is in long format.\n",
+ "y[0:N] refers to T0, y[N+1:2N] refers to T1, etc.\n",
+ "x[0:N] refers to T0, x[N+1:2N] refers to T1, etc.\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "y_long = y.reshape((y.shape[0]*y.shape[1],1), order='F')\n",
+ "x_long = x.reshape((x.shape[0]*3,2), order='F')\n",
+ "\n",
+ "fe_lag_long = spreg.Panel_FE_Lag(y_long, x_long, w, name_y=name_y, name_x=name_x, name_ds=\"NAT\")\n",
+ "print(fe_lag_long.summary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "------------------------\n",
+ "\n",
+ "## Spatial Error model\n",
+ "\n",
+ "Now, let's estimate a spatial error panel model with fixed effects:\n",
+ "\n",
+ "$$\n",
+ "y = X\\beta + \\mu_i + v\n",
+ "$$\n",
+ "\n",
+ "where\n",
+ "\n",
+ "$$\n",
+ "v = \\lambda W v + e\n",
+ "$$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:37:03.722913Z",
+ "start_time": "2021-01-04T16:36:59.755557Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "fe_error = spreg.Panel_FE_Error(\n",
+ " y, x, w, name_y=name_y, name_x=name_x, name_ds=\"NAT\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:37:03.729503Z",
+ "start_time": "2021-01-04T16:37:03.726165Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "REGRESSION\n",
+ "----------\n",
+ "SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL ERROR PANEL - FIXED EFFECTS\n",
+ "-------------------------------------------------------------------------\n",
+ "Data set : NAT\n",
+ "Weights matrix : unknown\n",
+ "Dependent Variable : HR Number of Observations: 9255\n",
+ "Mean dependent var : 0.0000 Number of Variables : 2\n",
+ "S.D. dependent var : 3.9228 Degrees of Freedom : 9253\n",
+ "Pseudo R-squared : 0.0084\n",
+ "Sigma-square ML : 14.923 Log likelihood : -67934.005\n",
+ "S.E of regression : 3.863 Akaike info criterion : 135872.010\n",
+ " Schwarz criterion : 135886.276\n",
+ "\n",
+ "------------------------------------------------------------------------------------\n",
+ " Variable Coefficient Std.Error z-Statistic Probability\n",
+ "------------------------------------------------------------------------------------\n",
+ " RD 0.8697923 0.1718029 5.0627323 0.0000004\n",
+ " PS -2.9660674 0.5444783 -5.4475397 0.0000001\n",
+ " lambda 0.1943460 0.0160253 12.1274197 0.0000000\n",
+ "------------------------------------------------------------------------------------\n",
+ "Warning: Assuming panel is in wide format.\n",
+ "y[:, 0] refers to T0, y[:, 1] refers to T1, etc.\n",
+ "x[:, 0:T] refers to T periods of k1, x[:, T+1:2T] refers to k2, etc.\n",
+ "================================ END OF REPORT =====================================\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(fe_error.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-01-04T16:37:03.735854Z",
+ "start_time": "2021-01-04T16:37:03.731739Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[ 0.8698],\n",
+ " [-2.9661],\n",
+ " [ 0.1943]])"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "numpy.around(fe_error.betas, decimals=4)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "------------------"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python [conda env:myenv] *",
+ "language": "python",
+ "name": "conda-env-myenv-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/_sources/notebooks/skater_reg.ipynb.txt b/_sources/notebooks/skater_reg.ipynb.txt
new file mode 100644
index 00000000..bfbcf91c
--- /dev/null
+++ b/_sources/notebooks/skater_reg.ipynb.txt
@@ -0,0 +1,716 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0a500c14",
+ "metadata": {},
+ "source": [
+ "# Skater Regression"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1847d3c6",
+ "metadata": {},
+ "source": [
+ "### This notebook shows the use of the Skater Regression funcion (Skater_reg), introduced by Anselin & Amaral (2021). For more information on the method, check:\n",
+ "https://www.researchgate.net/publication/353411566_Endogenous_Spatial_Regimes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e4afc8e0",
+ "metadata": {},
+ "source": [
+ "In this example, in addition to the required packages, we will use geopandas to load the data and matplotlib to plot the results. Alternatively, PySAL's own IO could also be used to load the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "a7a0de53",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Required imports\n",
+ "import libpysal as ps\n",
+ "import numpy as np\n",
+ "import spreg\n",
+ "from spreg.skater_reg import Skater_reg\n",
+ "\n",
+ "# Optional imports\n",
+ "import matplotlib.pyplot as plt\n",
+ "import geopandas as gpd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c01ad58d",
+ "metadata": {},
+ "source": [
+ "We use Messner et al. (2000) data on homicides and selected socio-economic characteristics for continental U.S. counties to exemplify the use of Skater_reg. It can be downloaded from PySAL's examples repository."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "52f82039",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the example from PySAL\n",
+ "ps.examples.load_example(\"NCOVR\")\n",
+ "data = gpd.read_file(ps.examples.get_path('NAT.shp')).set_index('FIPS')\n",
+ "\n",
+ "# Set depedent and independent variables and the W matrix.\n",
+ "y = data['HR90'].to_numpy()\n",
+ "x = data[['RD90','PS90','UE90']].to_numpy()\n",
+ "w = ps.weights.Queen.from_dataframe(data, use_index=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25be6fcf",
+ "metadata": {},
+ "source": [
+ "Skater_reg by default uses Euclidean distance to compute the Minimum Spanning Tree (MST). Therefore, we standardize the variables that will be used to compute the MST before calling the main Skater_reg function. Here, we use the X variables to compute the MST. Alternative specifications can be used.\n",
+ "\n",
+ "We set the number of clusters to 20 and minimum quorum to 100."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "ae2bd006",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1min 16s, sys: 31.6 s, total: 1min 48s\n",
+ "Wall time: 25.3 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Standardize the variables to be used to compute the minimum spanning tree (could add/remove any variable)\n",
+ "x_std = (x - np.mean(x,axis=0)) / np.std(x,axis=0)\n",
+ "\n",
+ "# Call the Skater_reg method based on OLS\n",
+ "results = Skater_reg().fit(20, w, x_std, {'reg':spreg.OLS,'y':y,'x':x}, quorum=100)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53218349",
+ "metadata": {},
+ "source": [
+ "The intermediate steps are stored in the attibute \\_trace. We can use this information to plot the decrease in the total sum of squared residuals by number of clusters. This information can be helpful to select the number of desired clusters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "4e862e49",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAGwCAYAAAC0HlECAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAABupUlEQVR4nO3deVyN6f8/8NfptBGVJJUiDGFkGWMIqYbKLk1CNNYwmKSsM2NsY2SXbDOGGFlmJjvzSUVRyBLGOjG2QmRJZGk53b8//Lq/jhbncOpU5/V8PM5D931f933e7+5y3l33dV+3RBAEAURERERULC11B0BERERUHrBoIiIiIlIAiyYiIiIiBbBoIiIiIlIAiyYiIiIiBbBoIiIiIlIAiyYiIiIiBWirO4CKJC8vD/fu3UPVqlUhkUjUHQ4REREpQBAEPH/+HJaWltDSKro/iUWTCt27dw/W1tbqDoOIiIg+QEpKCqysrIrczqJJhapWrQrgzTfd0NBQrbHk5OQgMjISrq6u0NHRUWsspUlT8waYuybmrql5A5qbu6bmDZRs7s+ePYO1tbX4OV4UFk0qlH9JztDQsEwUTZUrV4ahoaFG/WJpat4Ac9fE3DU1b0Bzc9fUvIHSyf19Q2s4EJyIiIhIASyaiIiIiBTAoomIiIhIASyaiIiIiBTAoomIiIhIASyaiIiIiBTAoomIiIhIASyaiIiIiBTAoomIiIhIAZwRvIyTyWSIi4tDamoqLCws4ODgAKlUqu6wiIiINA6LpjJsx44dGD9+PO7cuSOus7KyQnBwMDw8PNQYGRERkebh5bkyaseOHfD09JQrmADg7t278PT0xI4dO9QUGRERkWZi0VQGyWQyjB8/HoIgFNiWv87f3x8ymay0QyMiItJYLJrKoLi4uAI9TG8TBAEpKSmIi4srxaiIiIg0G4umMig1NVWl7YiIiOjjsWgqgywsLFTajoiIiD4ei6YyyMHBAVZWVpBIJIVul0gksLa2hoODQylHRkREpLlYNJVBUqkUwcHBAFBo4SQIApYtW8b5moiIiEoRi6YyysPDA+Hh4ahVq1aBbVWqVIGLi4saoiIiItJcLJrKMA8PD9y6dQsxMTHYsmWLWChlZmZi9erVao6OiIhIs7BoKuOkUimcnJwwYMAALF++XLxct3jxYrx8+VLN0REREWkOFk3lSKNGjdC3b18AQFpaGn799Vc1R0RERKQ5WDSVMz/88IP49cKFC/H69Ws1RkNERKQ5WDSVM3Z2dnB3dwcA3Lt3D6GhoeoNiIiISEOwaCqH3u5tCgoKQnZ2thqjISIi0gwsmsqhVq1aoWvXrgCA5ORkbNq0Sc0RERERVXwsmsqp6dOni1/PmzcPubm5aoyGiIio4mPRVE7Z29ujU6dOAIDr169j27Ztao6IiIioYmPRVI693ds0d+5cyGQyNUZDRERUsbFoKsc6duyIDh06AAD+/fdfbN++Xc0RERERVVwsmsoxiUQi19v0008/IS8vT40RERERVVwsmso5FxcXfPHFFwCACxcuYO/evWqOiIiIqGJi0VTOvdvbNGfOHAiCoMaIiIiIKiYWTRVA9+7d0aJFCwBAYmIiIiIi1BsQERFRBcSiqQKQSCRys4Szt4mIiEj1WDRVEH369MGnn34KADh+/DhiY2PVGxAREVEFw6KpgtDS0sL3338vLv/8889qjIaIiKjiYdFUgXh5eaFBgwYAgMOHD+Py5ctqjoiIiKjiYNFUgUilUnz33Xfi8l9//aXGaIiIiCoWFk0VzMCBA2FjYwMAOHv2LE6dOqXegIiIiCoIFk0VjI6ODqZNmyYuc2wTERGRarBoqoAGDx4MKysrAMD+/ftx7tw59QZERERUAbBoqoD09PQQGBgoLs+dO1eN0RAREVUMLJoqqGHDhsHY2BgAsH37dt5JR0RE9JFYNFVQlSpVgru7OwBAEAT2NhEREX0kFk0VWJcuXVC9enUAwLZt23Dt2jU1R0RERFR+sWiqwPT19TF+/HgAQF5eHubNm6fmiIiIiMovFk0V3JgxY8SxTZs2bcKtW7fUGg8REVF5pdaiycbGBhKJpMBr7NixAIDMzEyMGzcOVlZWqFSpEho3bozVq1fLHSMrKwvffvstTE1NYWBggF69euHOnTtybdLT0+Hj4wMjIyMYGRnBx8cHT58+lWuTnJyMnj17wsDAAKampvDz80N2dnaJ5l8aDA0Nxd6m3NxcBAUFqTkiIiKi8kmtRdOpU6eQmpoqvqKiogAAffv2BQBMmDABERERCAsLw5UrVzBhwgR8++232L17t3gMf39/7Ny5E9u2bUN8fDwyMzPRo0cPyGQysY23tzfOnTuHiIgIRERE4Ny5c/Dx8RG3y2QydO/eHS9evEB8fDy2bduG7du3y922X575+fmhatWqAIDQ0NACRSURERG9n1qLpho1asDc3Fx87du3D/Xr14ejoyMA4Pjx4xg8eDCcnJxgY2ODkSNHonnz5jh9+jQAICMjA+vWrcPixYvRuXNntGzZEmFhYbhw4QKio6MBAFeuXEFERAR+++032Nvbw97eHmvXrsW+ffuQlJQEAIiMjMTly5cRFhaGli1bonPnzli8eDHWrl2LZ8+eqeebo0ImJiZi7112djYWLlyo5oiIiIjKH211B5AvOzsbYWFhCAgIgEQiAQB06NABe/bswbBhw2BpaYnY2FhcvXoVwcHBAIDExETk5OTA1dVVPI6lpSWaNm2KY8eOwc3NDcePH4eRkRHatGkjtmnbti2MjIxw7Ngx2Nra4vjx42jatCksLS3FNm5ubsjKykJiYiKcnZ0LjTkrKwtZWVnicn6BlZOTg5ycHNV9cz5A/vvn//vtt99i+fLlePnyJX799VdMnDgR5ubm6gyxRLybtyZh7pqXu6bmDWhu7pqaN1CyuSt6zDJTNO3atQtPnz7FkCFDxHXLly+Hr68vrKysoK2tDS0tLfz222/o0KEDAOD+/fvQ1dVFtWrV5I5Vs2ZN3L9/X2xjZmZW4P3MzMzk2tSsWVNue7Vq1aCrqyu2Kcy8efMwa9asAusjIyNRuXJlxRIvYfmXPAGgc+fO2LNnD16/fg0/Pz+573VF83bemoa5ax5NzRvQ3Nw1NW+gZHJ/+fKlQu3KTNG0bt06dO3aVa63Z/ny5UhISMCePXtQp04dHDlyBGPGjIGFhQU6d+5c5LEEQRB7qwDIff0xbd41bdo0BAQEiMvPnj2DtbU1XF1dYWhoWHSypSAnJwdRUVFwcXGBjo4OAKBly5Y4cOAAsrKyEBUVhVWrVsHU1FStcapaYXlrCuaueblrat6A5uauqXkDJZu7okNxykTRdPv2bURHR2PHjh3iulevXuG7777Dzp070b17dwBAs2bNcO7cOSxatAidO3eGubk5srOzkZ6eLtfblJaWhnbt2gEAzM3N8eDBgwLv+fDhQ7F3ydzcHCdOnJDbnp6ejpycnAI9UG/T09ODnp5egfU6Ojpl5of57Vhq166NESNGYOXKlXjx4gVWrlyJn376Sc0RloyydA5KG3P
XvNw1NW9Ac3PX1LyBksld0eOViXmaQkNDYWZmJhZHwP+NC9LSkg9RKpUiLy8PANCqVSvo6OjIddWlpqbi4sWLYtFkb2+PjIwMnDx5Umxz4sQJZGRkyLW5ePEiUlNTxTaRkZHQ09NDq1atVJ+wGk2ZMkX84QgJCSkw9QIREREVTu1FU15eHkJDQzF48GBoa/9fx5ehoSEcHR0xadIkxMbG4ubNm9iwYQN+//139OnTBwBgZGSE4cOHIzAwEAcPHsTZs2cxaNAg2NnZiZfvGjdujC5dusDX1xcJCQlISEiAr68vevToAVtbWwCAq6srmjRpAh8fH5w9exYHDx7ExIkT4evrq/bLbKpmbW0tjmV69uwZli9frt6AiIiIygm1F03R0dFITk7GsGHDCmzbtm0bWrdujYEDB6JJkyYICgrC3LlzMXr0aLHN0qVL4e7uDi8vL7Rv3x6VK1fG3r17IZVKxTabN2+GnZ0dXF1d4erqimbNmmHTpk3idqlUiv3790NfXx/t27eHl5cX3N3dsWjRopJNXk2mTp0qfn+WLVuG58+fqzkiIiKisk/tY5pcXV0hCEKh28zNzREaGlrs/vr6+ggJCUFISEiRbUxMTBAWFlbscWrXro19+/a9P+AKoF69ehg4cCB+//13pKenY9WqVZgyZYq6wyIiIirT1N7TROrx3XffiXcGLl68GC9evFBzRERERGUbiyYNZWtri379+gF4cyfh5MmTsXXrVsTGxso9goaIiIjeYNGkwb7//nvx61WrVsHb2xvOzs6wsbGRm/6BiIiIWDRptKtXrxa6/u7du/D09GThRERE9BYWTRpKJpNh/PjxhW7LH5jv7+/PS3VERET/H4smDRUXF4c7d+4UuV0QBKSkpCAuLq4UoyIiIiq7WDRpqLdnP1dFOyIiooqORZOGsrCwUGk7IiKiio5Fk4ZycHCAlZWVOFfTuyQSCaytreHg4FDKkREREZVNLJo0lFQqRXBwMAAUWjgJgoBly5bJPY6GiIhIk7Fo0mAeHh4IDw9HrVq1CmzT0tJC06ZN1RAVERFR2cSiScN5eHjg1q1biImJwZYtWzBkyBAAQF5eHqZOnare4IiIiMoQFk0EqVQKJycnDBgwACEhITA3NwcA7Ny5E/Hx8WqOjoiIqGxg0URyqlSpgtmzZ4vLkyZNEie7JCIi0mQsmqiAoUOHokmTJgCAhIQEhIeHqzkiIiIi9WPRRAVoa2tjwYIF4vK0adOQnZ2txoiIiIjUj0UTFapbt25wdnYGAFy/fh2rV69Wc0RERETqxaKJCiWRSLBw4UJxefbs2Xj69Kn6AiIiIlIzFk1UpFatWmHgwIEAgCdPniAoKEjNEREREakPiyYq1ty5c6GnpwcAWLZsGZKTk9UcERERkXooXTRt3LgR+/fvF5cnT54MY2NjtGvXDrdv31ZpcKR+derUgZ+fHwAgKysLP/zwg5ojIiIiUg+li6aff/4ZlSpVAgAcP34cK1aswIIFC2BqaooJEyaoPEBSv++++w4mJiYAgLCwMJw9e1bNEREREZU+pYumlJQUfPLJJwCAXbt2wdPTEyNHjsS8efMQFxen8gBJ/YyNjcUeJkEQOOElERFpJKWLpipVquDx48cAgMjISHTu3BkAoK+vj1evXqk2OiozxowZg7p16wIADh48iIiICDVHREREVLqULppcXFwwYsQIjBgxAlevXkX37t0BAJcuXYKNjY2q46MyQk9PD/PmzROXJ0+eDJlMpsaIiIiISpfSRdPKlSthb2+Phw8fYvv27ahevToAIDExEQMGDFB5gFR2eHl54YsvvgAAXLx4ERs2bFBvQERERKVIW9kdjI2NsWLFigLrZ82apZKAqOySSCRYtGgROnbsCACYPn06+vfvDwMDAzVHRkREVPIUKprOnz+v8AGbNWv2wcFQ2efg4IDevXtj9+7dSE1NxZIlSzB9+nR1h0VERFTiFCqaWrRoAYlEUuQdU/nbJBIJx7logPnz52Pfvn2QyWRYsGABRo4ciZo1a6o7LCIiohKlUNF08+bNko6DyhFbW1uMHDkSq1evRmZmJmbOnMkH+hIRUYWnUNFUp06dko6DypkZM2Zg06ZNyMzMxNq1azF+/Hg0atRI3WERERGVGKUHgue7fPkykpOTkZ2dLbe+V69eHx0UlX01a9bElClTMH36dMhkMkydOhW7du1Sd1hEREQlRumi6caNG+jTpw8uXLggN85JIpEAAMc0aZAJEyZg1apVSE1Nxe7duxEXFwcHBwd1h0VERFQilJ6nafz48ahbty4ePHiAypUr49KlSzhy5Ag+//xzxMbGlkCIVFYZGBhgzpw54vLEiRP5eBUiIqqwlC6ajh8/jtmzZ6NGjRrQ0tKClpYWOnTogHnz5sHPz68kYqQybMiQIWjatCkA4OTJk/jzzz/VHBEREVHJULpokslkqFKlCgDA1NQU9+7dA/BmsHhSUpJqo6MyTyqVYsGCBeLytGnTkJWVpcaIiIiISobSRVPTpk3FyS7btGmDBQsW4OjRo5g9ezbq1aun8gCp7OvSpQs6deoE4M30FKtWrVJzRERERKqndNH0ww8/IC8vDwDw008/4fbt23BwcMDff/+N5cuXqzxAKvskEgkWLlwo3gwwZ84cpKenqzkqIiIi1VK6aHJzc4OHhwcAoF69erh8+TIePXqEtLQ0fPnllyoPkMqHli1bYtCgQQCA9PR0/Pzzz2qOiIiISLWULpoKY2JiIvYykOb66aefoKenBwBYvnw5bt26pd6AiIiIVEjpeZqcnZ2LLZAOHTr0UQFR+VW7dm34+/tj/vz5yM7Oxvfff4/NmzerOywiIiKVULqnqUWLFmjevLn4atKkCbKzs3HmzBnY2dmVRIxUjkydOhXVq1cHAGzZsgWJiYlqjoiIiEg1lO5pWrp0aaHrZ86ciczMzI8OiMo3Y2NjTJ8+Hf7+/gCASZMm4eDBg7x8S0RE5Z5KxjQBwKBBg7B+/XpVHY7KsW+++Qb169cHAMTExODvv/9Wc0REREQfT2VF0/Hjx6Gvr6+qw1E5pquri3nz5onLkydPRm5urhojIiIi+nhKX57Ln24gnyAISE1NxenTpzF9+nSVBUblm6enJ9q2bYuEhARcvnwZoaGh8PX1VXdYREREH0zpniYjIyO5l4mJCZycnPD3339jxowZJREjlUMSiQSLFi0Sl3/88UeOeSMionJN6Z6m0NDQkoiDKqD27dujT58+2LlzJ+7fv49x48bBzc0NFhYWcHBwgFQqVXeIREREClPZmCaiwgQFBUFL682P2caNG+Ht7Q1nZ2fY2Nhgx44dao6OiIhIcQr1NFWrVk3hW8afPHnyUQFRxXLx4kXxWYVvu3v3Ljw9PREeHl5gnBwREVFZpFDRtGzZMvHrx48f46effoKbmxvs7e0BvLlz7sCBAxwITnJkMhnGjx9f6DZBECCRSODv74/evXvzUh0REZV5ChVNgwcPFr/+6quvMHv2bIwbN05c5+fnhxUrViA6OhoTJkxQfZRULsXFxeHOnTtFbhcEASkpKYiLi4
OTk1PpBUZERPQBlB7TdODAAXTp0qXAejc3N0RHR6skKKoYUlNTVdqOiIhInZQumqpXr46dO3cWWL9r1y7xmWNEAGBhYaHSdkREROqk9JQDs2bNwvDhwxEbGyuOaUpISEBERAR+++03lQdI5ZeDgwOsrKxw9+5dCIJQaJtatWrBwcGhlCMjIiJSntI9TUOGDMGxY8dgbGyMHTt2YPv27TAyMsLRo0cxZMiQEgiRyiupVIrg4GAAKPLuSxsbG3FKAiIiorLsgz6t2rRpg82bN+PMmTM4e/YsNm/ejDZt2ih9HBsbG0gkkgKvsWPHim2uXLmCXr16wcjICFWrVkXbtm2RnJwsbs/KysK3334LU1NTGBgYoFevXgUGH6enp8PHx0ecxdzHxwdPnz6Va5OcnIyePXvCwMAApqam8PPzQ3Z2ttI5kTwPDw+Eh4ejVq1acuvzi6ijR49i48aN6giNiIhIKQoVTc+ePZP7uriXMk6dOoXU1FTxFRUVBQDo27cvAOD69evo0KEDGjVqhNjYWPzzzz+YPn263IOB/f39sXPnTmzbtg3x8fHIzMxEjx49IJPJxDbe3t44d+4cIiIiEBERgXPnzsHHx0fcLpPJ0L17d7x48QLx8fHYtm0btm/fjsDAQKXyocJ5eHjg1q1biImJwZYtW8R/840bNw7Xrl1TY4RERETvp/DklqmpqTAzM4OxsXGhl1ry5915u1h5nxo1asgtBwUFoX79+nB0dAQAfP/99+jWrRsWLFggtqlXr574dUZGBtatW4dNmzahc+fOAICwsDBYW1sjOjoabm5uuHLlCiIiIpCQkCD2hq1duxb29vZISkqCra0tIiMjcfnyZaSkpMDS0hIAsHjxYgwZMgRz586FoaGhwjlR4aRSaYFpBaKiorB+/Xq8ePECAwYMwLFjx6Crq6ueAImIiN5DoaLp0KFDMDExAQDExMSUSCDZ2dkICwtDQEAAJBIJ8vLysH//fkyePBlubm44e/Ys6tati2nTpsHd3R0AkJiYiJycHLi6uorHsbS0RNOmTXHs2DG4ubnh+PHjMDIykrt82LZtWxgZGeHYsWOwtbXF8ePH0bRpU7FgAt5MoZCVlYXExEQ4OzsXGnNWVhaysrLE5fyetpycHOTk5Kjy26O0/PdXdxzFWbx4MeLi4nDt2jUkJiZi2rRpCAoK+qhjloe8Swpz17zcNTVvQHNz19S8gZLNXdFjKlQ05ff8vPu1Ku3atQtPnz4VB5OnpaUhMzMTQUFB+OmnnzB//nxERETAw8MDMTExcHR0xP3796Grq4tq1arJHatmzZq4f/8+AOD+/fswMzMr8H5mZmZybWrWrCm3vVq1atDV1RXbFGbevHmYNWtWgfWRkZGoXLmyUvmXlPxLnmXV6NGjMWXKFOTm5mLJkiUwNDREixYtPvq4ZT3vksTcNY+m5g1obu6amjdQMrm/fPlSoXZKTzkQERGBKlWqoEOHDgCAlStXYu3atWjSpAlWrlxZoIBR1Lp169C1a1extyf/eWW9e/cWZxlv0aIFjh07hjVr1hRbvOVfKsxX3OVEZdq8a9q0aQgICBCXnz17Bmtra7i6uqr9kl5OTg6ioqLg4uICHR0dtcbyPjKZDJMnTwYArFmzBomJiQUu3SqqPOWtasxd83LX1LwBzc1dU/MGSjZ3RcdkK100TZo0CfPnzwcAXLhwAQEBAQgMDMShQ4cQEBCA0NBQZQ+J27dvIzo6Wu6p96amptDW1kaTJk3k2jZu3Bjx8fEAAHNzc2RnZyM9PV2uWEtLS0O7du3ENg8ePCjwng8fPhR7l8zNzXHixAm57enp6cjJySnQA/U2PT096OnpFVivo6NTZn6Yy1IsRQkMDER0dDQiIyNx//59jBw5Env37lX4IdGFKQ95lxTmrnm5a2regObmrql5AyWTu6LHU3rKgZs3b4qFzPbt29GzZ0/8/PPPWLVqFf73v/8pezgAQGhoKMzMzNC9e3dxna6uLlq3bo2kpCS5tlevXkWdOnUAAK1atYKOjo5cV11qaiouXrwoFk329vbIyMjAyZMnxTYnTpxARkaGXJuLFy/KPc4jMjISenp6aNWq1QflRIrT0tLCxo0bxcuo+/fvx4oVK9QcFRERkTyliyZdXV3x2l90dLQ4CNvExETpKQeAN5fhQkNDMXjwYGhry3d8TZo0CX/88QfWrl2L//77DytWrMDevXsxZswYAICRkRGGDx+OwMBAHDx4EGfPnsWgQYNgZ2cn3k3XuHFjdOnSBb6+vkhISEBCQgJ8fX3Ro0cP2NraAgBcXV3RpEkT+Pj44OzZszh48CAmTpwIX19ftV9m0xTm5ubYsGGDuDxp0iScP39efQERERG9Q+miqUOHDggICMCcOXNw8uRJsXfo6tWrsLKyUjqA6OhoJCcnY9iwYQW29enTB2vWrMGCBQtgZ2eH3377Ddu3bxfHUwHA0qVL4e7uDi8vL7Rv3x6VK1fG3r17IZVKxTabN2+GnZ0dXF1d4erqimbNmmHTpk3idqlUiv3790NfXx/t27eHl5cX3N3dsWjRIqXzoQ/XtWtX+Pv7A3hzZ+KAAQMUHpxHRERU0pQe07RixQqMGTMG4eHhWL16tTjT8//+9z906dJF6QBcXV2LfC4ZAAwbNqzQgiqfvr4+QkJCEBISUmQbExMThIWFFRtH7dq1sW/fvvcHTCUqKCgIMTEx+Oeff3D58mUEBgZi9erV6g6LiIhI+aKpqOJi6dKlKgmINJuenh62bt2KVq1a4dWrV1izZg1cXV3Rp08fdYdGREQa7oOePXf9+nX88MMPGDBgANLS0gC8mYrg0qVLKg2ONFPjxo3FB/0CwIgRIwo8T5CIiKi0KV00HT58GHZ2djhx4gR27NiBzMxMAMD58+cxY8YMlQdImmnEiBHw8PAAADx58gQ+Pj5KPaKHiIhI1ZQumqZOnYqffvoJUVFRcs8Jc3Z2xvHjx1UaHGkuiUSCtWvXijcXxMbGivODERERqYPSRdOFCxcKHV9So0YNPH78WCVBEQFvBvBv3rwZWlpvfkx//PHHApOQEhERlRaliyZjY2O5SSDznT17VryTjkhVOnbsiO+//x7Am8etDBgw4IPmAyMiIvpYShdN3t7emDJlCu7fvw+JRIK8vDwcPXoUEydOxNdff10SMZKG+/HHH2Fvbw/gzYz0+ZObEhERlSali6a5c+eidu3aqFWrFjIzM9GkSRN07NgR7dq1ww8//FASMZKG09bWxpYtW8TZ2Tdv3iw3OSkREVFpUKpoEgQB9+7dw9q1a3Ht2jX8+eefCAsLw7///otNmzbJzcJNpEo2Njb45ZdfxOUxY8bg+vXraoyIiIg0jVKTWwqCgAYNGuDSpUto0KAB6tWrV1JxERXQv39/HDhwABs2bEBmZia8vb0RHx+vsU/6JiKi0qVUT5OWlhYaNGjAu+RIbZYvX45PPvkEAHDy5En8+OOPao6IiIg0hdJjmhYsWIBJkybh4sWLJREPUbGqVq2KrVu3ir1L8+fPx6FDh9QcFRERaQKli6ZBgwbh5MmTaN68O
[Figure: plot output omitted (base64-encoded PNG image data removed)]