##################################
# Data Science Course
# Kyle Ott & Cornelius Schneider
# 12 December 2014
##################################
# Load packages
library(httr)
library(dplyr)
library(XML)
library(ggplot2)
library(stringr)
library(car)
library(devtools)
library(rsdmx)
library(stargazer)
library(knitr)
library(CausalImpact)
library(sandwich)
library(lmtest)
library(plm)
#################
# Time Series
#################
# set your local directory
setwd("/Users/Kyle/Dropbox/!Fall_2014/Collab_Data/Final_Project/")
# from loop_best2.R
load("cleaned.RData")
# Creating Summary Stats of Time Series
cleaned2 <- group_by(cleaned, year)
# number of observations by year
obs_all <- tally(cleaned2)
# total income by year
t1_totinc <- summarise(cleaned2, mean1 = mean(total_inc), median1 = median(total_inc), sd1 = sd(total_inc))
# total taxes paid by year
t1_tottax <- summarise(cleaned2, mean2 = mean(taxes_paid), median2 = median(taxes_paid), sd2 = sd(taxes_paid))
# average tax rate by year
t1_rat <- summarise(cleaned2, mean3 = mean(ratio), median3 = median(ratio), sd3 = sd(ratio))
t1a <- merge(obs_all, t1_totinc, by = "year")
t1b <- merge(t1a, t1_tottax, by = "year")
t1c <- merge(t1b, t1_rat, by = "year")
# now use the ungrouped data to get the totals across all years
# number of observations across all years
obs_all_sum <- tally(cleaned)
t1_totinc_sum <- summarise(cleaned, mean1 = mean(total_inc), median1 = median(total_inc), sd1 = sd(total_inc))
t1_tottax_sum <- summarise(cleaned, mean2 = mean(taxes_paid), median2 = median(taxes_paid), sd2 = sd(taxes_paid))
t1_rat_sum <- summarise(cleaned, mean3 = mean(ratio), median3 = median(ratio), sd3 = sd(ratio))
obs_all_sum$year <- "sum"
t1_totinc_sum$year <- "sum"
t1_tottax_sum$year <- "sum"
t1_rat_sum$year <- "sum"
t1a_sum <- merge(obs_all_sum, t1_totinc_sum, by = "year")
t1b_sum <- merge(t1a_sum, t1_tottax_sum, by = "year")
t1c_sum <- merge(t1b_sum, t1_rat_sum, by = "year")
summarytableTS <- rbind(t1c, t1c_sum)
knitr::kable(summarytableTS, align = "c", digits = 0, format = "latex",
             col.names = c("Year", "N", "Mean Income (Euros)", "Median Income (Euros)",
                           "SD of Income (Euros)", "Mean Tax Paid (Euros)",
                           "Median Tax Paid (Euros)", "SD of Tax Paid (Euros)",
                           "Mean Ave Tax Rate (%)", "Median Ave Tax Rate (%)",
                           "SD of Ave Tax Rate (%)"))
# without time trend
M1 <- lm(log(avg_inc) ~ log(net_of_tax), data = cleaned)
summary(M1)
confint(M1)
# Durbin-Watson test for autocorrelation
dwt(M1)
# we have autocorrelation, so Newey-West standard errors are advisable;
# stargazer can report them via its 'se' argument (sketch below)
coeftest(M1, vcov = NeweyWest)
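# a possible recipe (sketch): stargazer's 'se' argument accepts a list of
# replacement standard errors, so the Newey-West SEs can be passed in
# directly; nw_se_m1 is an illustrative name
nw_se_m1 <- sqrt(diag(NeweyWest(M1)))
stargazer(M1, se = list(nw_se_m1), type = "text",
          notes = "Newey-West standard errors")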
# including time trend
M2 <- lm(log(avg_inc) ~ log(net_of_tax) + year, data = cleaned)
summary(M2)
confint(M2)
# Durbin-Watson test for autocorrelation
dwt(M2)
# autocorrelation again, so Newey-West SEs are advisable here too;
# NeweyWest(M2) errors out with a matrix problem, and it is unclear whether
# the cause is our data or R (a possible workaround is sketched below)
coeftest(M2, vcov = NeweyWest)
NeweyWest(M2)
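# a possible workaround (sketch): NeweyWest() prewhitens by default
# (prewhite = TRUE fits a VAR(1) to the estimating functions first), which
# can fail numerically; turning prewhitening off and fixing the lag length
# may sidestep the error
coeftest(M2, vcov = NeweyWest(M2, lag = 1, prewhite = FALSE))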
#################
# Panel Data
#################
# from loop_best2.R
load("clean5obs.RData")
panel <- clean5obs
# pull each person's total income in 2011, 2012 and 2013 into its own
# column (per-person yearly snapshots, not time dummies); the per-person
# max() below assumes the data are grouped by person, so group first
panel <- group_by(panel, justname)
panel <- mutate(panel, t11 = ifelse(year == 2011, total_inc, 0))
panel <- mutate(panel, t11 = max(t11))
panel <- mutate(panel, t12 = ifelse(year == 2012, total_inc, 0))
panel <- mutate(panel, t12 = max(t12))
panel <- mutate(panel, t13 = ifelse(year == 2013, total_inc, 0))
panel <- mutate(panel, t13 = max(t13))
# we are creating our dependent variable here
panel <- mutate(panel, dep1 = t11/total_inc)
panel <- mutate(panel, dep2 = t12/total_inc)
panel <- mutate(panel, dep3 = t13/total_inc)
panel <- mutate(panel, dep1a = ifelse(year == 2009, dep1, 1))
panel <- mutate(panel, dep2a = ifelse(year == 2010, dep2, 1))
panel <- mutate(panel, dep3a = ifelse(year == 2011, dep3, 1))
# our dependent variable is total income in t+2 divided by income in t;
# multiplying the three variables does just that: e.g. for a 2009 row,
# dep2a and dep3a equal 1, so dep = income_2011 / income_2009
panel <- mutate(panel, dep = dep1a*dep2a*dep3a)
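# quick sanity check of the construction on a one-person toy series
# (made-up numbers): the 2009 row of dep1a should equal
# income_2011 / income_2009
toy <- data.frame(year = 2009:2013, total_inc = c(100, 110, 121, 133, 146))
toy <- mutate(toy, t11 = max(ifelse(year == 2011, total_inc, 0)),
              dep1 = t11 / total_inc,
              dep1a = ifelse(year == 2009, dep1, 1))
toy$dep1a[toy$year == 2009]  # 121 / 100 = 1.21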
# here we are creating our independent variable
panel <- mutate(panel, tax11 = ifelse(year == 2011, net_of_tax, 1))
panel <- mutate(panel, tax11 = max(tax11))
panel <- mutate(panel, tax12 = ifelse(year == 2012, net_of_tax, 1))
panel <- mutate(panel, tax12 = max(tax12))
panel <- mutate(panel, tax13 = ifelse(year == 2013, net_of_tax, 1))
panel <- mutate(panel, tax13 = max(tax13))
panel <- mutate(panel, indep1 = tax11/net_of_tax)
panel <- mutate(panel, indep2 = tax12/net_of_tax)
panel <- mutate(panel, indep3 = tax13/net_of_tax)
panel <- mutate(panel, indep1a = ifelse(year == 2009, indep1, 1))
panel <- mutate(panel, indep2a = ifelse(year == 2010, indep2, 1))
panel <- mutate(panel, indep3a = ifelse(year == 2011, indep3, 1))
# our independent variable is the net-of-tax rate in t+2 divided by the
# net-of-tax rate in t; multiplying these three variables will do just that
panel <- mutate(panel, indep = indep1a*indep2a*indep3a)
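# an equivalent, more direct construction using dplyr::lead (sketch),
# assuming exactly one row per person-year sorted by year within person;
# unlike dep/indep above it yields NA (not 1) outside 2009-2011, and the
# *_alt names are illustrative, not used downstream
panel_alt <- arrange(panel, justname, year)
panel_alt <- group_by(panel_alt, justname)
panel_alt <- mutate(panel_alt,
                    dep_alt = lead(total_inc, 2) / total_inc,
                    indep_alt = lead(net_of_tax, 2) / net_of_tax)
panel_alt <- ungroup(panel_alt)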
panel <- ungroup(panel)
# after this conversion only three time periods (2009-2011) carry
# information, so the 2012 and 2013 rows are unnecessary
panelmodel2 <- panel[!(panel$year == 2012 | panel$year == 2013), ]
# duplicates?
# we noticed something odd in the panel data when looking at people's names:
# duplicate names within the same year (with different income information).
# two people could genuinely share a name, but to be safe we drop anyone
# whose name appears more than once in the same year
dup <- panelmodel2[c("justname", "year")]
duplicated(dup)
# three name-year pairs are duplicated
nodup <- dup[!duplicated(dup[c("justname", "year")]), ]
# dup2 lists the three duplicated pairs; we use it only to look up the
# offending ids, which are dropped via id2 below
dup2 <- dup[duplicated(dup[c("justname", "year")]), ]
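# an alternative that drops every row belonging to a duplicated name-year
# pair in one step (sketch; the hard-coded id2 drop below is what the
# analysis actually uses)
panel_nodup <- filter(group_by(panelmodel2, justname, year), n() == 1)
panel_nodup <- ungroup(panel_nodup)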
# zeros in dep or indep are impossible by construction, so they flag bad
# data; this filter turns up three additional bad ids
bad_zero <- filter(panelmodel2, dep == 0 | indep == 0)
# the six ids below combine the three duplicates with the three zero cases
bad <- filter(panelmodel2, id2 %in% c(141, 934, 2263, 1342, 1827, 2535))
# dropping these observations; we cannot tell whether they are errors in the
# source website's table
panelmodel3 <- panelmodel2[!(panelmodel2$id2 %in% c(141, 934, 2263, 1342, 1827, 2535)), ]
save(panelmodel3, file = "/Users/Kyle/Dropbox/!Fall_2014/Collab_Data/Final_Project/panelmodel3.RData")
# Pooled OLS
ols1 <- lm(log(dep) ~ log(indep), data=panelmodel3)
summary(ols1)
confint(ols1)
# Durbin-Watson test for autocorrelation
dwt(ols1)
# unclear whether the autocorrelation is strong
panelmodel3$id2 <- as.character(panelmodel3$id2)
# plm still fails when indexed by id2, so we use justname from here on
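# the Durbin-Watson test above is not panel-aware; a panel version of the
# check (sketch) is the Breusch-Godfrey/Wooldridge test on an explicitly
# pooled plm model (pool1 is an illustrative name)
pool1 <- plm(log(dep) ~ log(indep), data = panelmodel3,
             index = c("justname", "year"), model = "pooling")
pbgtest(pool1)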
# Fixed Effects
fixed <- plm(log(dep) ~ log(indep), data=panelmodel3, index=c("justname", "year"), model="within")
summary(fixed)
confint(fixed)
# Testing for fixed effects, null: OLS better than fixed
pFtest(fixed, ols1)
# Breusch-Pagan LM test of cross-sectional dependence
pcdtest(fixed, test = c("lm"))
# evidence of cross-sectional dependence
# testing for serial correlation
pbgtest(fixed)
# evidence of serial correlation
# testing for heteroskedasticity with the Breusch-Pagan test
bptest(log(dep) ~ log(indep) + factor(justname), data = panelmodel3, studentize=F)
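# given the evidence of serial correlation, cross-sectional dependence and
# heteroskedasticity above, conventional SEs are suspect; one option
# (sketch) is the Arellano covariance matrix, robust to heteroskedasticity
# and within-person serial correlation
coeftest(fixed, vcov = vcovHC(fixed, method = "arellano", cluster = "group"))
# Driscoll-Kraay SEs (plm::vcovSCC) would also address cross-sectional
# dependence, but they lean on a long time dimension, which we lack here
coeftest(fixed, vcov = vcovSCC(fixed))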