
Commit cbc08cb

Committed Nov 20, 2015
Adding a snapshot of python scripts from phdthesis.git to public repo
1 parent 916e63e commit cbc08cb

20 files changed (+9121, -0 lines changed)
 

README.md → README.rst

File renamed without changes.

chapter_2+3_theoryCA_networkstructure.py

+846
Large diffs are not rendered by default.

chapter_4_probable_improbable.py

+532
Large diffs are not rendered by default.

chapter_5_productfragmentation-baci92.py

+2,943
Large diffs are not rendered by default.

chapter_5_productfragmentation-wits-sitcr3.py

+553
Large diffs are not rendered by default.

chapter_x_appendix_A_dataset.py

+89
"""
Chapter: Appendix A Dataset
===========================

This contains the tables, plots and analysis used in the construction of the Dataset Appendix chapter.

NES and Non-Country Areas => percentage of trade data recorded under non-country ISO3C codes
    (NES denotes "not elsewhere specified" regions)
NBER AX Codes as a % of World Exports => percentage of trade data recorded under AX product codes

"""
import re
import pandas as pd
import matplotlib.pyplot as plt

from dataset_info import TARGET_RAW_DIR, CHAPTER_RESULTS

RESULTS_DIR = CHAPTER_RESULTS["A"]

#------#
#-NBER-#
#------#

#-Setup-#
#-------#
source_dir = TARGET_RAW_DIR["nber"]
for year in xrange(1962, 2000+1, 1):
    if year == 1962:
        source_data = pd.read_hdf(source_dir+"nber_year.h5", 'Y%s'%year)
    else:
        source_data = source_data.append(pd.read_hdf(source_dir+"nber_year.h5", 'Y%s'%year))

#-World Values-#
world_values = source_data.loc[(source_data.importer == "World") & (source_data.exporter == "World")]
world_values = world_values.groupby("year").sum()["value"]


#-NES and Non-Country Areas-#
#---------------------------#
plt.clf()
from pyeconlab.trade.dataset.NBERWTF.meta import countryname_to_iso3c
data = source_data.copy()
data = data.loc[(data.importer != "World") & (data.exporter != "World")].reset_index()
data["EC"] = data["exporter"].apply(lambda x: countryname_to_iso3c[x])
data["IC"] = data["importer"].apply(lambda x: countryname_to_iso3c[x])
data["NC"] = data[["EC", "IC"]].apply(lambda row: 1 if (row["EC"]==".")|(row["IC"]==".") else 0, axis=1)
nc_data = data.loc[data.NC == 1].groupby("year").sum()["value"]
#-Percentage-#
#~Trade~#
result = nc_data.div(world_values)*100
describe = result.describe()
describe.to_csv(RESULTS_DIR + "nber_notcountry_percent_world_trade_table.csv")
pd.DataFrame(describe).to_latex(RESULTS_DIR + "nber_notcountry_percent_world_trade_table.tex")
ax = result.plot(title="NES Trade Flows [% of World Trade]", yticks=[0,1,2,3,4,5])
ax.set_ylabel("Percent of World Trade")
ax.set_xlabel("Year")
plt.savefig(RESULTS_DIR + "nber_notcountry_percent_world_trade_plot.pdf")
plt.clf()
#~Export~#
data_export = data.groupby(["year", "exporter", "EC"]).sum().reset_index()
nc_data_export = data_export.loc[data_export.EC == "."].groupby("year").sum()["value"]
result = nc_data_export.div(world_values)*100
describe = result.describe()
describe.to_csv(RESULTS_DIR + "nber_notcountry_percent_world_export_table.csv")
pd.DataFrame(describe).to_latex(RESULTS_DIR + "nber_notcountry_percent_world_export_table.tex")
ax = result.plot(title="NES Export Flows [% of World Trade]")
ax.set_ylabel("Percent of World Trade")
ax.set_xlabel("Year")
plt.savefig(RESULTS_DIR + "nber_notcountry_percent_world_export_plot.pdf")
plt.clf()
#~Import ... Not Required~#
del data, nc_data, data_export, nc_data_export


#-NBER AX Codes as a % of World Exports-#
#---------------------------------------#
data = source_data.copy()
data["AX"] = data["sitc4"].apply(lambda x: 1 if re.search("[aAxX]", x) else 0)
AX = data.loc[data.AX == 1]
AX = AX.groupby("year").sum()["value"]
#-Percentage-#
result = AX.div(world_values)*100
describe = result.describe()
describe.to_csv(RESULTS_DIR + "nber_ax_percent_world_trade_table.csv")
pd.DataFrame(describe).to_latex(RESULTS_DIR + "nber_ax_percent_world_trade_table.tex")
ax = result.plot(title="AX [% of World Trade]")
ax.set_ylabel("Percent of World Trade")
ax.set_xlabel("Year")
plt.savefig(RESULTS_DIR + "nber_ax_percent_world_trade_plot.pdf")
del data, AX
chapter_x_appendix_B_RCAmeasures.py

+21
"""
Appendix B: Revealed Comparative Advantage
==========================================

Analysis of Revealed Comparative Advantage Indicators

"""

#------------------------------------------------------------------------------#
#-Compare RCA (Endogenously Computed Values with World Values) using NBER Data-#
#------------------------------------------------------------------------------#

# IPYTHON NOTEBOOK:
# ./AppendixC/RCAStudy-NBER2000-TotalWorldValues-Vs-CompleteNetwork.ipynb


#-Compare RCA Metrics-#
#-Balassa (1965) and Yu (2009)-#

# IPYTHON NOTEBOOK
# ./AppendixC/RCAStudy-NBER-Compare-RCA-Metrics.ipynb
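# For reference, the Balassa (1965) index compared in the notebooks above is the
# standard export-share ratio RCA_cp = (x_cp / x_c) / (x_p / x_w). A minimal
# hedged sketch, assuming a long-form frame with country, productcode and
# export columns (not the notebooks' own implementation):
#
#   def balassa_rca(df):
#       x_cp = df.groupby(["country", "productcode"])["export"].sum()
#       x_c = df.groupby("country")["export"].sum()
#       x_p = df.groupby("productcode")["export"].sum()
#       x_w = df["export"].sum()
#       return x_cp.div(x_c, level="country").div(x_p / x_w, level="productcode")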
+110
"""
Appendix G: Symmetric vs. Asymmetric Proximity Matrices

Year 2000 - NBER DATA

"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pyeconlab import DynamicProductLevelExportSystem

#-Local Imports-#
from dataset_info import TARGET_DATASET_DIR, CHAPTER_RESULTS
DATASET_DIR = TARGET_DATASET_DIR['nber']
RESULTS_DIR = CHAPTER_RESULTS["G"]

data = pd.read_hdf(DATASET_DIR+"nber-export-sitcr2l4-1962to2000.h5", "D")
data = data.rename(columns={'eiso3c' : 'country', 'sitc4' : 'productcode', 'value' : 'export'})
data = data.set_index(["year"])
system = DynamicProductLevelExportSystem()
system.from_df(data)

#-Year 2000-#
ys = system[2000]
ys.rca_matrix(complete_data=True)
ys.mcp_matrix()
ys.compute_pci()
ys.auto_adjust_pci_sign()
pci = ys.pci.copy()

#-Example Proximity Values-#
from pyeconlab.trade.classification import SITCR2
sitc_to_name = SITCR2().code_description_dict()
prox1 = ys.proximity_matrix()
products1 = ["8423", "0711"]
products2 = ["0611", "2927", "8451", "7810", "8441", "6584", "7924"]
exval = prox1.filter(items=products1, axis=0).filter(items=products2, axis=1).T.unstack().to_frame(name="Proximity")
exval.index.names = ["P1", "P2"]
exval = exval.groupby(level="P1").apply(lambda x: x.sort(columns="Proximity", ascending=False))
exval.index = exval.index.droplevel()
exval = exval.reset_index()
exval["P1 Description"] = exval["P1"].apply(lambda x: sitc_to_name[x])
exval["P2 Description"] = exval["P2"].apply(lambda x: sitc_to_name[x])
exval = exval.set_index(["P1", "P1 Description", "P2", "P2 Description"])
exval.to_excel(RESULTS_DIR+"proximity-examples-yr2000-nber-datasetD.xlsx")


#-Symmetric Proximity Analysis-#
prox1 = ys.compute_proximity(matrix_type='symmetric')
fig1 = ys.plot_proximity(prox_cutoff=0.6, sortby=pci, sortby_text="PCI", step=15)
ax = fig1.gca()
ax.set_title("Symmetric Proximity Matrix [Yr: 2000]")
plt.savefig(RESULTS_DIR + "proximity-symmetric-yr2000-nber-datasetD.png", dpi=600)

products1 = ["8423", "8441", "8451", "6584"]
products2 = ["8423", "8441", "8451", "6584"]
exval = prox1.filter(items=products1, axis=0).filter(items=products2, axis=1).T.unstack().to_frame(name="Proximity")
exval.index.names = ["P1", "P2"]
exval = exval.groupby(level="P1").apply(lambda x: x.sort(columns="Proximity", ascending=False))
exval.index = exval.index.droplevel()
exval = exval.reset_index()
exval = exval.set_index(["P1", "P2"]).unstack()
exval = exval.reset_index()
exval["P1 Description"] = exval["P1"].apply(lambda x: sitc_to_name[x])
exval = exval.set_index(["P1", "P1 Description"])
exval.to_excel(RESULTS_DIR+"proximity-symmetric-examples-yr2000-nber-datasetD.xlsx")


#-------------------------------#
#-Asymmetric Proximity Analysis-#
#-------------------------------#
prox2 = ys.compute_proximity(matrix_type="asymmetric")
fig2 = ys.plot_proximity(prox_cutoff=0.6, sortby=pci, sortby_text="PCI", step=15)
ax = fig2.gca()
ax.set_title("Asymmetric Proximity Matrix [Yr: 2000]")
plt.savefig(RESULTS_DIR + "proximity-asymmetric-yr2000-nber-datasetD-value-examples.png", dpi=600)

products1 = ["8423", "8441", "8451", "6584"]
products2 = ["8423", "8441", "8451", "6584"]
exval = prox2.filter(items=products1, axis=0).filter(items=products2, axis=1).T.unstack().to_frame(name="Proximity")
exval.index.names = ["P1", "P2"]
exval = exval.groupby(level="P1").apply(lambda x: x.sort(columns="Proximity", ascending=False))
exval.index = exval.index.droplevel()
exval = exval.reset_index()
exval = exval.set_index(["P1", "P2"]).unstack()
exval = exval.reset_index()
exval["P1 Description"] = exval["P1"].apply(lambda x: sitc_to_name[x])
exval = exval.set_index(["P1", "P1 Description"])
exval.to_excel(RESULTS_DIR+"proximity-asymmetric-yr2000-nber-datasetD-value-examples.xlsx")

#-Histogram Comparing Symmetric and Asymmetric Proximity-#
s1 = prox1.unstack()
s1 = s1.apply(lambda x: np.nan if x == 1 else x)
s1 = s1.apply(lambda x: np.nan if x == 0 else x)
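# (Values of exactly 1, chiefly the self-proximity diagonal, and values of
# exactly 0 are masked to NaN so they do not dominate the histogram bins;
# the same masking is applied to the asymmetric matrix below.)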
s1v = s1.values
bins = np.linspace(0, 1, 50)
plt.hist(s1v, bins, alpha=0.5, label="Symmetric")
s2 = prox2.unstack()
s2 = s2.apply(lambda x: np.nan if x == 1 else x)
s2 = s2.apply(lambda x: np.nan if x == 0 else x)
s2v = s2.values
bins = np.linspace(0, 1, 50)
plt.hist(s2v, bins, alpha=0.5, label="Asymmetric")
plt.legend(loc="upper right")
ax = plt.gca()
ax.set_xlabel("Proximity Values")
ax.set_ylabel("Frequency")
plt.savefig(RESULTS_DIR+"proximity-symmetric-and-asymmetric-overlayedhistogram-yr2000-nber-datasetD.png", dpi=600)

dataset_analyse_baci.py

+195
"""
Analyse Tables, Plots and Construct Meta Data for BACI Data
"""

import os
import gc
import glob
import matplotlib.pyplot as plt
import pandas as pd

#-HS Levels-#
HS96L6 = True

#-SITC Levels-#
SITCR2L5 = True
SITCR2L4 = True
SITCR2L3 = True
SITCR2L2 = True
SITCR2L1 = True

#---------------#
#-Control Logic-#
#---------------#

RAW_SIMPLESTATS_TABLE = True

DATASET_PRODUCTCODE_INTERTEMPORAL_TABLES = True
DATASET_COUNTRYCODE_INTERTEMPORAL_TABLES = True
DATASET_SIMPLESTATS_TABLE = True
DATASET_PERCENTWORLDTRADE_PLOTS = True

#-----#
#-RAW-#
#-----#

from dataset_info import RESULTS_DIR, TARGET_DATASET_DIR
SOURCE_DIR = TARGET_DATASET_DIR["baci96"]
STORE = "raw_baci_hs96-1998-2012.h5"
RESULTS_DIR = RESULTS_DIR["baci96"]

if RAW_SIMPLESTATS_TABLE:

    from pyeconlab.trade.util import describe

    print "Running RAW_SIMPLESTATS_TABLE ..."

    DIR = RESULTS_DIR + "tables/"
    STORE = SOURCE_DIR + STORE

    print "Running STATS on File %s" % STORE
    store = pd.HDFStore(STORE)
    for dataset in sorted(store.keys()):
        dataset = dataset.strip("/")  #Remove Directory Structure
        print "Computing SIMPLE STATS for dataset: %s" % dataset
        data = pd.read_hdf(STORE, key=dataset)
        productcode = "hs6"
        dataset_table = describe(data, table_name=dataset, productcode=productcode, exporter="eiso3n", importer="iiso3n")
        del data
        gc.collect()
    store.close()
    #-Excel Table-#
    fl = "baciraw-trade-hs6-1998to2012_stats.xlsx"
    dataset_table.to_excel(DIR + fl)
    #-Latex Snippet-#
    fl = "baciraw-trade-hs6-1998to2012_stats.tex"
    with open(DIR + fl, "w") as latex_file:
        latex_file.write(dataset_table.to_latex())

#----------#
#-DATASETS-#
#----------#

from dataset_info import RESULTS_DIR, TARGET_DATASET_DIR
SOURCE_DIR = TARGET_DATASET_DIR["baci96"]
STORES = glob.glob(SOURCE_DIR + "*.h5")
RESULTS_DIR = RESULTS_DIR["baci96"]

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## ---> Product Composition Tables <--- ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##

STORES = [x for x in STORES if x.split("/")[-1][0:3] != "raw"]  #Filter Out RAW Files

def split_filenames(fl):
    dataset, data_type, classification, years = fl.split("-")
    classification, product_level = classification[:-2], classification[-1:]
    return dataset, data_type, classification, product_level
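# Worked example of the rule above (filenames as built by dataset_construct_baci.py):
#   split_filenames("baci-export-sitcr2l4-1998to2012.h5")
#   -> ("baci", "export", "sitcr2", "4")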
if DATASET_PRODUCTCODE_INTERTEMPORAL_TABLES:

    print "Running DATASET_PRODUCTCODE_INTERTEMPORAL_TABLES ..."

    DIR = RESULTS_DIR + "intertemporal-productcodes/"

    for store in STORES:
        print "Computing Composition Tables for: %s" % store
        dataset, data_type, classification, product_level = split_filenames(store.split("/")[-1])
        store = pd.HDFStore(store)
        for dataset in store.keys():
            print "Computing table for dataset: %s ..." % dataset
            dataset = dataset.strip("/")
            intertemp_product = store[dataset].groupby(["year", "sitc%s"%product_level]).sum().unstack("year")
            intertemp_product.columns = intertemp_product.columns.droplevel()
            intertemp_product.to_excel(DIR + "intertemporal_product_%s_%sl%s_%s.xlsx"%(data_type, classification, product_level, dataset))
        store.close()
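# The composition tables above (and the country tables below) follow one pattern:
# collapse to (year, code) totals, then pivot years into columns. A hedged toy
# illustration (frame and values are hypothetical):
#   df = pd.DataFrame({"year": [1998, 1998, 1999],
#                      "sitc4": ["0011", "0012", "0011"],
#                      "value": [10.0, 5.0, 12.0]})
#   df.groupby(["year", "sitc4"]).sum().unstack("year")
#   # -> one row per product code, one column of total value per year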
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## ---> Country Composition Tables <--- ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##

if DATASET_COUNTRYCODE_INTERTEMPORAL_TABLES:

    print "Running DATASET_COUNTRYCODE_INTERTEMPORAL_TABLES ..."

    DIR = RESULTS_DIR + "intertemporal-countrycodes/"

    for store in STORES:
        print "Computing Composition Tables for: %s" % store
        dataset, data_type, classification, product_level = split_filenames(store.split("/")[-1])
        store = pd.HDFStore(store)
        for dataset in store.keys():
            print "Computing table for dataset: %s ..." % dataset
            dataset = dataset.strip("/")
            if data_type == "export":
                intertemp_country = store[dataset].groupby(["year", "eiso3c"]).sum().unstack("year")
            elif data_type == "import":
                intertemp_country = store[dataset].groupby(["year", "iiso3c"]).sum().unstack("year")
            else:
                continue
            intertemp_country.columns = intertemp_country.columns.droplevel()
            intertemp_country.to_excel(DIR + "intertemporal_country_%s_%s_%s.xlsx"%(data_type, classification, dataset))
        store.close()

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## ----> SIMPLE STATS TABLES <---- ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##


if DATASET_SIMPLESTATS_TABLE:

    from pyeconlab.trade.util import describe

    print "Running DATASET_SIMPLESTATS_TABLE: ..."

    DIR = RESULTS_DIR + "tables/"

    for dataset_file in STORES:
        print "Running STATS on File %s" % dataset_file
        store = pd.HDFStore(dataset_file)
        for dataset in sorted(store.keys()):
            dataset = dataset.strip("/")  #Remove Directory Structure
            print "Computing SIMPLE STATS for dataset: %s" % dataset
            data = pd.read_hdf(dataset_file, key=dataset)
            productcode = "".join(dataset_file.split("/")[-1].split("-")[2].split("r2l"))
            dataset_table = describe(data, table_name=dataset, productcode=productcode)
            if dataset == "A":
                table = dataset_table
            else:
                table = table.merge(dataset_table, left_index=True, right_index=True)
        store.close()
        #-Excel Table-#
        fl = dataset_file.split("/")[-1].split(".")[0] + "_stats" + ".xlsx"
        table.to_excel(DIR + fl)
        #-Latex Snippet-#
        fl = dataset_file.split("/")[-1].split(".")[0] + "_stats" + ".tex"
        with open(DIR + fl, "w") as latex_file:
            latex_file.write(table.to_latex())


if DATASET_PERCENTWORLDTRADE_PLOTS:

    print "DATASET_PERCENTWORLDTRADE_PLOTS ... "

    DIR = RESULTS_DIR + "plots/percent_world_values/"

    #-World Values-#
    fl = "./output/dataset/baci96/raw_baci_world_yearly-1998to2012.h5"
    world_values = pd.read_hdf(fl, key="World")["value"]

    for dataset_file in STORES:
        print "Producing GRAPH on File %s" % dataset_file
        store = pd.HDFStore(dataset_file)
        datasets = store.keys()
        for dataset in sorted(datasets):
            print "Computing GRAPH for dataset: %s" % dataset
            data = pd.read_hdf(dataset_file, key=dataset)
            yearly_values = data.groupby(["year"]).sum()["value"]
            percent_values = yearly_values.div(world_values)*100
            fig = percent_values.plot(title="Dataset: %s (%s)"%(dataset, dataset_file))
            plt.savefig(DIR + "%s_%s_percent_wld.pdf"%(dataset, dataset_file.split('/')[-1].split('.')[0]))
            plt.close()
        store.close()

dataset_analyse_nber.py

+1,037
Large diffs are not rendered by default.

dataset_analyse_nberbaci.py

+392
Large diffs are not rendered by default.

dataset_analyse_other.py

+210
"""
Analyse Other Datasets
======================

"atlas" -> Atlas of Complexity

"""

import gc
import re
import glob
import pandas as pd
import matplotlib.pyplot as plt

from dataset_info import RESULTS_DIR, TARGET_DATASET_DIR

#-Control-#

ATLAS = True

#-Atlas of Complexity-#
if ATLAS:
    #-Setup Source-#
    SOURCE_DIR = TARGET_DATASET_DIR["atlas"]
    HS_STORES = glob.glob(SOURCE_DIR + "*_hs92_*.h5")
    SITC_STORES = glob.glob(SOURCE_DIR + "*_sitcr2_*.h5")
    RESULTS_DIR = RESULTS_DIR["atlas"]

    #----------------------------------#
    #-ProductCode Intertemporal Tables-#
    #----------------------------------#

    print
    print "[INFO] Computing ProductCode Intertemporal Tables ..."

    DIR = RESULTS_DIR + "intertemporal-productcodes/"

    #-SITC DATA-#
    for store in SITC_STORES:
        print "Analysing SITC File: %s ..." % store
        fln = store.split("/")[-1].split(".")[0]
        store = pd.HDFStore(store)
        for dataset in store.keys():
            print "Computing table for dataset: %s ..." % dataset
            dataset = dataset.strip("/")
            product_level = int(dataset[-1])
            intertemp_product = store[dataset].groupby(["year", "sitc%s"%product_level]).sum().unstack("year")
            intertemp_product.columns = intertemp_product.columns.droplevel()
            intertemp_product.to_excel(DIR + "%s_L%s.xlsx"%(fln, product_level))
        store.close()

    #-HS DATA-#
    for store in HS_STORES:
        print "Analysing HS File: %s ..." % store
        fln = store.split("/")[-1].split(".")[0]
        store = pd.HDFStore(store)
        for dataset in store.keys():
            print "Computing table for dataset: %s ..." % dataset
            dataset = dataset.strip("/")
            product_level = int(dataset[-1])
            intertemp_product = store[dataset].groupby(["year", "hs%s"%product_level]).sum().unstack("year")
            intertemp_product.columns = intertemp_product.columns.droplevel()
            intertemp_product.to_excel(DIR + "%s_L%s.xlsx"%(fln, product_level))
        store.close()

    #----------------------------------#
    #-CountryCode Intertemporal Tables-#
    #----------------------------------#

    print
    print "[INFO] Computing CountryCode Intertemporal Tables ..."

    DIR = RESULTS_DIR + "intertemporal-countrycodes/"

    #-SITC-#
    for store in SITC_STORES:
        print "Analysing SITC File: %s ..." % store
        fln = store.split("/")[-1].split(".")[0]
        store = pd.HDFStore(store)
        for dataset in store.keys():
            print "Computing table for dataset: %s ..." % dataset
            product_level = int(dataset[-1])
            if product_level != 4:
                continue
            dataset = dataset.strip("/")
            if re.search("export", fln):
                print "[INFO] Export Data"
                intertemp_country = store[dataset].groupby(["year", "eiso3c"]).sum().unstack("year")
            elif re.search("import", fln):
                print "[INFO] Import Data"
                intertemp_country = store[dataset].groupby(["year", "iiso3c"]).sum().unstack("year")
            else:
                continue
            intertemp_country.columns = intertemp_country.columns.droplevel()
            intertemp_country.to_excel(DIR + "%s.xlsx"%(fln))
        store.close()

    #-HS DATA-#
    for store in HS_STORES:
        print "Analysing HS File: %s ..." % store
        fln = store.split("/")[-1].split(".")[0]
        store = pd.HDFStore(store)
        for dataset in store.keys():
            print "Computing table for dataset: %s ..." % dataset
            dataset = dataset.strip("/")
            if re.search("export", fln):
                print "[INFO] Export Data"
                intertemp_country = store[dataset].groupby(["year", "eiso3c"]).sum().unstack("year")
            elif re.search("import", fln):
                print "[INFO] Import Data"
                intertemp_country = store[dataset].groupby(["year", "iiso3c"]).sum().unstack("year")
            else:
                continue
            intertemp_country.columns = intertemp_country.columns.droplevel()
            intertemp_country.to_excel(DIR + "%s.xlsx"%(fln))
        store.close()

    ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
    ## ----> SIMPLE STATS TABLES <---- ##
    ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##


    from pyeconlab.trade.util import describe

    print "Running DATASET_SIMPLESTATS_TABLE: ..."

    DIR = RESULTS_DIR + "tables/"

    #-SITC DATA-#

    for dataset_file in SITC_STORES:
        print "Running (SITC) STATS on File %s" % dataset_file
        store = pd.HDFStore(dataset_file)
        for dataset in sorted(store.keys()):
            product_level = dataset.strip("/")  #Remove Directory Structure
            print "Computing SIMPLE STATS for dataset: %s" % product_level
            data = pd.read_hdf(dataset_file, key=dataset)
            productcode = "sitc%s"%(product_level[-1])
            dataset_table = describe(data, table_name=product_level, productcode=productcode)
            #-Memory Reduction-#
            del data
            gc.collect()
            if product_level == "L1":
                table = dataset_table
            else:
                table = table.merge(dataset_table, left_index=True, right_index=True)
        store.close()
        #-Excel Table-#
        fl = dataset_file.split("/")[-1].split(".")[0] + "_stats" + ".xlsx"
        table.to_excel(DIR + fl)
        #-Latex Snippet-#
        fl = dataset_file.split("/")[-1].split(".")[0] + "_stats" + ".tex"
        with open(DIR + fl, "w") as latex_file:
            latex_file.write(table.to_latex())

    #-HS DATA-#

    for dataset_file in HS_STORES:
        print "Running (HS) STATS on File %s" % dataset_file
        store = pd.HDFStore(dataset_file)
        for dataset in sorted(store.keys()):
            product_level = dataset.strip("/")  #Remove Directory Structure
            print "Computing SIMPLE STATS for dataset: %s" % product_level
            data = pd.read_hdf(dataset_file, key=dataset)
            productcode = "hs%s"%(product_level[-1])
            dataset_table = describe(data, table_name=product_level, productcode=productcode)
            #-Memory Reduction-#
            del data
            gc.collect()
            if product_level == "L1":
                table = dataset_table
            else:
                table = table.merge(dataset_table, left_index=True, right_index=True)
        store.close()
        #-Excel Table-#
        fl = dataset_file.split("/")[-1].split(".")[0] + "_stats" + ".xlsx"
        table.to_excel(DIR + fl)
        #-Latex Snippet-#
        fl = dataset_file.split("/")[-1].split(".")[0] + "_stats" + ".tex"
        with open(DIR + fl, "w") as latex_file:
            latex_file.write(table.to_latex())

    #-------#
    #-PLOTS-#
    #-------#

    #-Intertemporal Number of Positive Productcodes-#

    DIR = RESULTS_DIR + "plots/intertemporal-productcodes-num/"

    for dataset_file in SITC_STORES:
        print "Running (SITC) PLOTS on File %s" % dataset_file
        store = pd.HDFStore(dataset_file)
        for dataset in sorted(store.keys()):
            product_level = dataset.strip("/")  #Remove Directory Structure
            print "Computing PLOT for dataset: %s" % product_level
            data = pd.read_hdf(dataset_file, key=dataset)
            productcode = "sitc%s"%(product_level[-1])
            if re.search("rca", dataset_file):
                value = "rca"
            else:
                value = "value"
            data_year = data.groupby(["year", productcode], as_index=False).sum().groupby("year").apply(lambda row: row[value].count())
            fig = data_year.plot(title="Dataset: %s (%s)"%(dataset, dataset_file))
            plt.savefig(DIR + "%s_%s_numproducts.pdf"%(dataset_file.split('/')[-1].split('.')[0], product_level))
            plt.close()
            #-Memory Reduction-#
            del data, data_year
            gc.collect()
        store.close()

dataset_compile_raw.py

+142
"""
Compile RAW Data into a Single Data File
========================================

Author: Matthew McKay (mamckay@gmail.com)

This script compiles and converts (where necessary) raw data into a single data file.

Sources
-------
[1] NBER
[2] BACI

Notes
-----
1. Care must be taken when working with the CSV files:
    a. After 1984 there are some float values in the 'value' column
    b. icode, ecode and sitc4 need to be imported explicitly as strings
2. Stata and HDF files are closer in type than CSV; both contain "" values, which should be replaced with np.nan
3. Unit and quantity information is only available after 1984 in the NBER dataset
4. HDF files are both compact and fast, and are therefore used as the standard file source for dataset objects

"""
import sys
import pandas as pd
import csv
import numpy as np

#-Dataset Information-#
from dataset_info import SOURCE_DIR, TARGET_RAW_DIR

#------#
#-NBER-#
#------#

#-Convert Each Year to a CSV File-#
def nber_convert_dta_to_csv(source_dir, target_dir):
    for year in range(1962, 2000+1, 1):
        fn = source_dir + "wtf%s.dta" % str(year)[2:]
        print "Loading Year: %s from file: %s" % (year, fn)
        data = pd.read_stata(fn)
        fn = target_dir + "wtf%s.csv" % str(year)[2:]
        print "Converting Year: %s from file: %s" % (year, fn)
        data.to_csv(fn, index=False, quoting=csv.QUOTE_NONNUMERIC)
    print "Convert DTA to CSV Finished!"

#-Convert All Years to an HDF File-#
def nber_convert_dta_to_hdf(source_dir, target_dir, index='year'):
    if index == 'year':
        fn = target_dir + "nber_year.h5"
        store = pd.HDFStore(fn, complevel=9, complib='zlib')
        for year in range(1962, 2000+1, 1):
            fn = source_dir + "wtf%s.dta" % str(year)[2:]
            print "Loading Year: %s from file: %s" % (year, fn)
            data = pd.read_stata(fn)
            store.put('Y'+str(year), data, format='table')
        print "HDF File Saved ..."
        print store
        store.close()
    else:
        data = pd.DataFrame()
        for year in range(1962, 2000+1, 1):
            fn = source_dir + "wtf%s.dta" % str(year)[2:]
            print "Loading Year: %s from file: %s" % (year, fn)
            data = data.append(pd.read_stata(fn))
        fn = target_dir + "nber.h5"
        store = pd.HDFStore(fn, complevel=9, complib='zlib')
        store.put('nber', data, format='table')
        print "HDF File Saved ..."
        print store
        store.close()
    print "Convert DTA to HDF Finished!"

#-Convert NBER Supplementary Data-#
def nber_supp_convert_dta_to_hdf(source_dir, target_dir):
    """
    Save NBER supplementary data into an HDF file "nber_supp_year.h5"
    """
    fn = target_dir + "nber_supp_year.h5"
    store = pd.HDFStore(fn, complevel=9, complib='zlib')
    for year in xrange(1988, 2000+1, 1):
        fn = source_dir + "china_hk%s.dta" % str(year)[2:]
        print "[NBER-SUPP] Loading Year: %s from file: %s" % (year, fn)
        data = pd.read_stata(fn)
        store.put('Y'+str(year), data, format='table')
    print "HDF file Saved ..."
    print store
    store.close()


#------#
#-BACI-#
#------#

#-Convert All CSV Year Files to an HDF File-#
def baci_convert_dta_to_hdf(source_dir, target_dir):
    fn = target_dir + "baci_year.h5"
    store = pd.HDFStore(fn, complevel=9, complib='zlib')
    for year in range(1998, 2012+1, 1):
        fn = source_dir + "baci96_%s.csv" % str(year)
        print "Loading Year: %s from file: %s" % (year, fn)
        data = pd.read_csv(fn, dtype={'hs6' : str})
        store.put('Y'+str(year), data, format='table')
    print "HDF File Saved"
    print store
    store.close()
    print "Convert CSV to HDF Finished!"

#-Raw Data Conversions and Comparisons-#

if __name__ == "__main__":

    #-Execution Settings-#
    NBER = True
    dta_to_csv = False  # Using HDF as the key data structure due to its size and speed advantage
    dta_to_hdf = True   # Data structure of choice

    BACI = True
    csv_to_hdf = True

    #-Convert NBER-#
    if NBER:
        source_dir = SOURCE_DIR['nber']
        target_dir = TARGET_RAW_DIR['nber']
        #-Conversions-#
        if dta_to_csv:
            print "Convert dta to csv files"
            nber_convert_dta_to_csv(source_dir, target_dir)
        if dta_to_hdf:
            print "Convert dta to hdf file"
            nber_convert_dta_to_hdf(source_dir, target_dir, index='year')
            nber_supp_convert_dta_to_hdf(source_dir, target_dir)

    #-Convert BACI-#
    if BACI:
        source_dir = SOURCE_DIR['baci96']
        target_dir = TARGET_RAW_DIR['baci96']
        #-Conversions-#
        if csv_to_hdf:
            print "Convert csv to hdf file"
            baci_convert_dta_to_hdf(source_dir, target_dir)

dataset_construct_baci.py

+202
"""
Compute BACI Datasets
=====================

Author: Matthew McKay (mamckay@gmail.com)

Filename rules: {{ source }}-{{ flow }}-{{ classification }}-{{ years }}-{{ raw/cleaned }}-{{ type }}-{{ id }}
    source-flow-classification-years-raw/cleaned-type-id

Supporting Scripts
------------------
1. dataset_info.py          Contains information about the relevant datasets
2. dataset_compile_raw.py   Compiles RAW data files to a single dataset file

Sources
-------
2. baci
    md5: e988b6544563675492b59f397a8cb6bb
    notes: BACI Trade RAW Dataset [HS96]

Supporting Files
----------------
TBD

"""
import numpy as np
import pandas as pd
from pyeconlab.util import concord_data
import gc

#----------#
#- BACI96 -#
#----------#

#-Dataset Information-#
from dataset_info import TARGET_RAW_DIR, TARGET_DATASET_DIR, YEARS

#-Setup Local Environment-#
#~~~~~~~~~~~~~~~~~~~~~~~~~#
SOURCE_DIR = TARGET_RAW_DIR['baci96']
TARGET_DIR = TARGET_DATASET_DIR['baci96']
start_year, end_year = YEARS['baci96']

#-Helper Functions-#
#~~~~~~~~~~~~~~~~~~#

def load_raw_dataset(fn, start_year, end_year, verbose=True):
    """
    Load Raw BACI Dataset
    """
    data = pd.DataFrame()
    for year in range(start_year, end_year+1, 1):
        print "Loading Year: %s" % year
        data = data.append(pd.read_hdf(fn, "Y%s"%year))
    if verbose: print data.t.unique()
    return data

#-Source Information-#
#~~~~~~~~~~~~~~~~~~~~#
print
print "---> Loading RAW Data <---"
fn = SOURCE_DIR + "baci_year.h5"
rawdata = load_raw_dataset(fn, start_year, end_year)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#-Construct SITC Revision 2 Datasets-#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

CONSTRUCT_SITC_DATASETS = True

if CONSTRUCT_SITC_DATASETS:

    from pyeconlab.trade.dataset.CEPIIBACI import SITC_DATASET_DESCRIPTION, SITC_DATASET_OPTIONS
    from pyeconlab.trade.dataset.CEPIIBACI import construct_sitc

    LEVELS = [1, 2, 3, 4, 5]
    DATA_TYPES = ["trade", "export", "import"]

    for level in LEVELS:
        #-Import this as a Function from pyeconlab-#
        print
        print "---> COMPUTING SITC REVISION 2 LEVEL %s DATASETS <---" % level
        print
        for data_type in DATA_TYPES:
            #-Setup Store-#
            fn = "baci-%s-sitcr2l%s-%sto%s.h5" % (data_type, level, start_year, end_year)  #-Write File: {{ source }}-{{ flow }}-{{ classification }}-{{ years }}.h5-#
            store = pd.HDFStore(TARGET_DIR+fn, complevel=9, complib='zlib')
            #-Compute Datasets-#
            for dataset in sorted(SITC_DATASET_OPTIONS.keys()):
                print "[SITCR2L%s] Computing Dataset %s for %s" % (level, dataset, data_type)
                #-Compute Data-#
                #INTERFACE: def construct_sitc(data, data_classification, data_type, level, revision, check_concordance=True, adjust_units=False, concordance_institution="un", multiindex=True, verbose=True):#
                data = construct_sitc(rawdata.copy(deep=True), data_classification="HS96", data_type=data_type, level=level, revision=2, **SITC_DATASET_OPTIONS[dataset])
                store.put(dataset, data, format='table')
                store.get_storer(dataset).attrs.options = SITC_DATASET_OPTIONS[dataset]
                store.get_storer(dataset).attrs.data_type = data_type
                store.get_storer(dataset).attrs.description = SITC_DATASET_DESCRIPTION[dataset]
                print
            #-Close-#
            store.close()
            del data
            gc.collect()

#----------#
#-RAW DATA-#
#----------#

RAW_DATA = True
RAW_WORLD_YEARLY = True
RAW_COUNTRY_YEARLY = True
RAW_PRODUCT_YEARLY = True

#-Adjust RAW Data to have common interface names-#
stdnames = {'t' : 'year', 'i' : 'eiso3n', 'j' : 'iiso3n', 'v' : 'value', 'q' : 'quantity'}
rawdata = rawdata.rename(columns=stdnames)

if RAW_DATA:
    print
    print "---> SAVING RAW DATA (WITH STANDARD COLUMN NAMES) <---"
    print
    fn = "raw_baci_hs96-1998-2012.h5"
    store = pd.HDFStore(TARGET_DIR+fn, complevel=9, complib='zlib')
    store.put('RAW', rawdata, format='table')
    store.close()


if RAW_WORLD_YEARLY:

    ## Should this be filtered through a countries-only filter? ##

    print
    print "---> COMPUTING WORLD YEARLY VALUES FROM RAW BACI DATASET <---"
    print
    fn = "raw_baci_world_yearly-1998to2012.h5"
    store = pd.HDFStore(TARGET_DIR+fn, complevel=9, complib='zlib')
    world_values = rawdata[["year", "value"]].groupby(["year"]).sum()
    store.put('World', world_values, format='table')
    store.close()
    del world_values
    gc.collect()

if RAW_COUNTRY_YEARLY:
    print
    print "---> COMPUTING COUNTRY YEARLY VALUES FROM RAW BACI DATASET <---"
    print
    #-Setup Store-#
    fn = "raw_baci_country_year-1998to2012.h5"
    store = pd.HDFStore(TARGET_DIR+fn, complevel=9, complib='zlib')
    #-Import ISO3C-#
    from pyeconlab.trade.dataset.CEPIIBACI.meta import hs96_iso3n_to_iso3c
    rawdata['eiso3c'] = rawdata['eiso3n'].apply(lambda x: concord_data(hs96_iso3n_to_iso3c, x, issue_error=np.nan))  #Is this Complete?
    rawdata['iiso3c'] = rawdata['iiso3n'].apply(lambda x: concord_data(hs96_iso3n_to_iso3c, x, issue_error=np.nan))  #Is this Complete?
    #-Country Exports-#
    exports = rawdata[["year", "eiso3c", "value"]].groupby(["year", "eiso3c"]).sum().reset_index()
    store.put("CountryExports", exports, format='table')
    #-Country Imports-#
    imports = rawdata[["year", "iiso3c", "value"]].groupby(["year", "iiso3c"]).sum().reset_index()
    store.put("CountryImports", imports, format='table')
    store.close()
    del exports
    del imports
    gc.collect()

if RAW_PRODUCT_YEARLY:

    ## Should this be filtered through a countries-only filter? ##

    print
    print "---> COMPUTING PRODUCT YEAR VALUES FROM RAW BACI DATASET (HS and SITC) <---"
    print
    #-Setup Store-#
    fn = "raw_baci_product_year-1998to2012.h5"
    store = pd.HDFStore(TARGET_DIR+fn, complevel=9, complib='zlib')
    #-HS-#
    for level in [6, 5, 4, 3, 2, 1]:
        print "Computing HS%s Product Year Values ..." % level
        data = rawdata.copy(deep=True)
        if level != 6:
            data["hs%s"%level] = data["hs6"].apply(lambda x: x[0:level])
        product_trade = data[["year", "hs%s"%level, "value"]].groupby(["year", "hs%s"%level]).sum().reset_index()
        store.put("HS96L%s"%level, product_trade, format='table')
        del data
        del product_trade
        gc.collect()

    #-SITC-#
    from pyeconlab.trade.concordance import HS_To_SITC
    concordance = HS_To_SITC(hs="HS96", sitc="SITCR2", hs_level=6, sitc_level=5, source_institution='un', verbose=True).concordance
    for level in [5, 4, 3, 2, 1]:
        print "Computing SITC%s Product Year Values ..." % level
        data = rawdata.copy(deep=True)
        data['sitc5'] = data['hs6'].apply(lambda x: concord_data(concordance, x, issue_error=np.nan))
        if level != 5:
            data["sitc%s"%level] = data["sitc5"].apply(lambda x: x[0:level])
        product_trade = data[["year", "sitc%s"%level, "value"]].groupby(["year", "sitc%s"%level]).sum().reset_index()
        store.put("SITCR2L%s"%level, product_trade, format='table')
        del data
        del product_trade
        gc.collect()
    store.close()

dataset_construct_nber.py

+700
Large diffs are not rendered by default.

dataset_construct_nber_options.py

+169
"""
NBER DATASET CONSTRUCT OPTIONS
"""

#-Dataset Configuration-#
#~~~~~~~~~~~~~~~~~~~~~~~#

#-Future Work: Check this is consistent with pyeconlab definitions-#

DATA_DESCRIPTION = {
    #-Country Datasets-#
    'A' : u"A basic dataset that includes AX and SITCR2 indicators and collapses data to a specified level, maintaining initial countrycodes and productcodes as in the raw dataset; removes NES",
    'B' : u"[A] except corrects HK-CHINA data from NBER correction files",
    'C' : u"A dataset that does not contain AX, adjusts HK-CHINA data, but does not adjust products or countries for intertemporal consistency",
    'D' : u"A dataset that does not contain AX or any non-standard SITCR2 codes, adjusts HK-CHINA data, but does not adjust products or countries for intertemporal consistency",
    'E' : u"A dataset that does not contain AX, updates productcodes to be more intertemporally consistent, adjusts HK-CHINA data, but does not adjust countries for intertemporal consistency",
    'F' : u"A dataset that does not contain AX, updates productcodes to be more intertemporally consistent, adjusts HK-CHINA data, and adjusts countries for intertemporal consistency",
    'G' : u"A dataset that does not contain AX or any non-standard SITCR2 codes, adjusts HK-CHINA data, and adjusts country codes for intertemporal consistency",
    # 'H' : u"A dataset that does not contain AX, updates productcodes to be more intertemporally consistent, adjusts HK-CHINA data, adjusts country codes for intertemporal consistency and drops non-complete countries (EXPERIMENTAL)",
    # 'I' : u"A dataset that does not contain AX or any non-standard SITCR2 codes, adjusts HK-CHINA data, and drops countries that are not intertemporally complete (EXPERIMENTAL)",
}

RAW_DATA_DESCRIPTION = {
    #-Raw Dataset Descriptions-#
    'RAW1' : u"Basic RAW dataset with iso3c countrycodes included and collapsed quantity disaggregation",
    'RAW2' : u"Basic RAW dataset with iso3c countrycodes included, collapsed quantity disaggregation, and adjusted HK-CHINA data",
}

#-Data Option Definitions-#

DATA_OPTIONS = {
    'A' : {
        #-ProductCode Adjustments-#
        'AX' : True,                        #Add a Marker for 'A' and 'X' Codes
        'dropAX' : False,                   #Drop Products whose Codes contain 'A' or 'X'
        'sitcr2' : True,                    #Add an Official SITC Revision 2 Indicator
        'drop_nonsitcr2' : False,           #Remove Non-Official SITC Revision 2 Codes from the Dataset
        'adjust_hk' : False,                #Adjust Data to Incorporate the Hong Kong Adjustments provided by NBER
        'intertemp_productcode' : False,    #Compute an Intertemporal ProductCode
        #-CountryCode Adjustments-#
        'intertemp_cntrycode' : False,      #Recode Country Codes to be Intertemporally Consistent
        'drop_incp_cntrycode' : False,      #Drop Incomplete Intertemporal Countries
        #-Other Adjustments-#
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    'B' : {
        'AX' : True,
        'dropAX' : False,
        'sitcr2' : True,
        'drop_nonsitcr2' : False,
        'adjust_hk' : True,
        'intertemp_productcode' : False,
        'intertemp_cntrycode' : False,
        'drop_incp_cntrycode' : False,
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    'C' : {
        'AX' : True,
        'dropAX' : True,
        'sitcr2' : True,
        'drop_nonsitcr2' : False,
        'adjust_hk' : True,
        'intertemp_productcode' : False,
        'intertemp_cntrycode' : False,
        'drop_incp_cntrycode' : False,
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    'D' : {  #-!!-MAJOR-!!-#
        'AX' : True,
        'dropAX' : True,
        'sitcr2' : True,
        'drop_nonsitcr2' : True,
        'adjust_hk' : True,
        'intertemp_productcode' : False,
        'intertemp_cntrycode' : False,
        'drop_incp_cntrycode' : False,
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    'E' : {  #-!!-MAJOR-!!-#
        'AX' : True,
        'dropAX' : True,
        'sitcr2' : True,
        'drop_nonsitcr2' : False,
        'adjust_hk' : True,
        'intertemp_productcode' : True,
        'intertemp_cntrycode' : False,
        'drop_incp_cntrycode' : False,
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    'F' : {  #-!!-MAJOR-!!-#
        'AX' : True,
        'dropAX' : True,
        'sitcr2' : True,
        'drop_nonsitcr2' : False,
        'adjust_hk' : True,
        'intertemp_productcode' : True,
        'intertemp_cntrycode' : True,
        'drop_incp_cntrycode' : False,
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    'G' : {
        'AX' : True,
        'dropAX' : True,
        'sitcr2' : True,
        'drop_nonsitcr2' : True,
        'adjust_hk' : True,
        'intertemp_productcode' : False,
        'intertemp_cntrycode' : True,
        'drop_incp_cntrycode' : False,
        'adjust_units' : False,
        'source_institution' : 'un',
        'verbose' : True,
    },
    # 'H' : {  #-!!-EXPERIMENTAL-!!-#
    #     'AX' : True,
    #     'dropAX' : True,
    #     'sitcr2' : True,
    #     'drop_nonsitcr2' : False,
    #     'adjust_hk' : True,
    #     'intertemp_productcode' : True,
    #     'intertemp_cntrycode' : True,
    #     'drop_incp_cntrycode' : True,
    #     'adjust_units' : False,
    #     'source_institution' : 'un',
    #     'verbose' : True,
    # },
    # 'I' : {  #-!!-EXPERIMENTAL-!!-#
    #     'AX' : True,
    #     'dropAX' : True,
    #     'sitcr2' : True,
    #     'drop_nonsitcr2' : True,
    #     'adjust_hk' : True,
    #     'intertemp_productcode' : False,
    #     'intertemp_cntrycode' : False,
    #     'drop_incp_cntrycode' : True,
    #     'adjust_units' : False,
    #     'source_institution' : 'un',
    #     'verbose' : True,
    # },
}


RAW_DATA_OPTIONS = {
    #-RAW includes NES, World etc. and Undertakes a Minimum of Changes to the Data to make it Comparable-#
    'RAW1' : {
        'adjust_hk' : False,            #Adjust Hong Kong Data
        'harmonised_raw' : True,        #Construct Harmonised RAW Data File (No Quantity Disaggregation, Common Names)
        #-Required Due to Script Logic Below-#
        'intertemp_productcode' : False,
    },
    'RAW2' : {
        'adjust_hk' : True,             #Adjust Hong Kong Data
        'harmonised_raw' : True,        #Construct Harmonised RAW Data File (No Quantity Disaggregation, Common Names)
        #-Required Due to Script Logic Below-#
        'intertemp_productcode' : False,
    },
}
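# Usage sketch (hedged; mirrors how dataset_construct_baci.py consumes its own
# SITC_DATASET_OPTIONS): each key selects one dataset variant and the value dict
# is passed as keyword arguments to the construction function, e.g.
#
#   from dataset_construct_nber_options import DATA_OPTIONS
#   for dataset in sorted(DATA_OPTIONS.keys()):
#       options = DATA_OPTIONS[dataset]   # expanded as **options downstream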

dataset_construct_nberbaci.py

+648
Large diffs are not rendered by default.

dataset_construct_other.py

+134
"""
Compilation of Other Useful Datasets
====================================

1. World Development Indicators Dataset
2. Atlas of Complexity Dataset
3. Penn World Table Dataset

Constructs h5 datasets and pyeconlab objects.

"""

import os
import gc
import shutil
import warnings
import pandas as pd

from pyeconlab import WDI, CIDAtlasDataConstructor, PENN
from dataset_info import SOURCE_DIR, TARGET_DATASET_DIR

#---------#
#-Control-#
#---------#

COMPILE_WDI = False
COMPILE_ATLAS = False  #-!!-Requires AWS for 'Trade'-!!-# 'Trade Disabled'
COMPILE_PENN = True

#------------------------------#
#-World Development Indicators-#
#------------------------------#

if COMPILE_WDI:
    wdi = WDI(source_dir=SOURCE_DIR['wdi'])
    stata_wide_fln = wdi.to_stata(table_type="wide", target_dir=TARGET_DATASET_DIR['wdi'])  # wdi_data_wide.dta
    stata_long_fln = wdi.to_stata(table_type="long", target_dir=TARGET_DATASET_DIR['wdi'])  # wdi_data_long.dta
    hdf_fln = wdi.to_hdf(target_dir=TARGET_DATASET_DIR['wdi'])  # wdi_data.h5

#-----------------------------#
#-Atlas of Complexity Dataset-#
#-----------------------------#

#-Countries Only Dataset-#

if COMPILE_ATLAS:
    #-Values-#
    print "[INFO] Processing VALUES Data ..."
    for classification in ["SITCR2", "HS92"]:
        warnings.warn("This will not compile 'trade' data - just export and import data")
        for dtype in ["export", "import"]:  # -- !! -- Excluding "trade" -- !! -- due to memory constraints -- use stata -- #
            print "Processing %s for %s data ..." % (classification, dtype)
            atlas = CIDAtlasDataConstructor(source_dir=SOURCE_DIR['atlas'], trade_classification=classification, dtype=dtype, reduce_memory=True)
            atlas.construct_standardized_dataset()
            #-Store-#
            startyear = atlas.dataset.year.min()
            endyear = atlas.dataset.year.max()
            fln = TARGET_DATASET_DIR["atlas"] + "cidatlas_%s_%s_%sto%s.h5"%(classification.lower(), dtype, startyear, endyear)
            store = pd.HDFStore(fln, complevel=9, complib='zlib')
            #-Country Value Data-#
            atlas.countries_only()
            #-Value-#
            for level in [4, 3, 2, 1]:
                gc.collect()
                print "[INFO] Saving Level %s ... " % level
                if classification == "SITCR2":
                    productid = "sitc%s"%level
                    if dtype == "export":
                        idx = ["year", "eiso3c", productid]
                    elif dtype == "import":
                        idx = ["year", "iiso3c", productid]
                    else:
                        idx = ["year", "eiso3c", "iiso3c", productid]
                    if level != 4:
                        atlas.dataset[productid] = atlas.dataset["sitc4"].apply(lambda x: x[0:level])
                if classification == "HS92":
                    productid = "hs%s"%level
                    if dtype == "export":
                        idx = ["year", "eiso3c", productid]
                    elif dtype == "import":
                        idx = ["year", "iiso3c", productid]
                    else:
                        idx = ["year", "eiso3c", "iiso3c", productid]
                    if level != 4:
                        atlas.dataset[productid] = atlas.dataset["hs4"].apply(lambda x: x[0:level])
                #-Collapse Levels-#
                countrydata = atlas.dataset[idx+["value"]].groupby(idx, as_index=False).sum()
                store.put("L%s"%level, countrydata, format="table")
                del countrydata
            store.close()
            del atlas
            gc.collect()

    #-RCA-#
    print "[INFO] Processing RCA Data ..."
    for classification in ["SITCR2", "HS92"]:
        for dtype in ["export", "import"]:
            print "Processing %s for %s data ..." % (classification, dtype)
            atlas = CIDAtlasDataConstructor(source_dir=SOURCE_DIR['atlas'], trade_classification=classification, dtype=dtype)
            atlas.construct_standardized_dataset()
            #-Store-#
            startyear = atlas.dataset.year.min()
            endyear = atlas.dataset.year.max()
            fln = TARGET_DATASET_DIR["atlas"] + "cidatlas_%s_%s_rca_%sto%s.h5"%(classification.lower(), dtype, startyear, endyear)
            store = pd.HDFStore(fln, complevel=9, complib='zlib')
            #-Country RCA Data-#
            atlas.countries_only()
            countrydata = atlas.dataset.copy(deep=True)
            if classification == "SITCR2":
                if dtype == "export":
                    idx = ["year", "eiso3c", "sitc4"]
                elif dtype == "import":
                    idx = ["year", "iiso3c", "sitc4"]
            if classification == "HS92":
                if dtype == "export":
                    idx = ["year", "eiso3c", "hs4"]
                elif dtype == "import":
                    idx = ["year", "iiso3c", "hs4"]
            countrydata = countrydata.groupby(idx).sum()["rca"].reset_index()
            store.put("L4", countrydata, format="table")
            store.close()
            del countrydata
            gc.collect()


#--------------------------#
#-Penn World Table Dataset-#
#--------------------------#

if COMPILE_PENN:
    print "[INFO] Processing PENN World Tables ... "
    penn = PENN(source_dir=SOURCE_DIR['penn'])
    penn.to_hdf(fl="penn_%s_%sto%s.h5"%(penn.version, penn.start_year, penn.end_year), target_dir=TARGET_DATASET_DIR["penn"])
    penn.to_stata(fl="penn_%s_%sto%s.dta"%(penn.version, penn.start_year, penn.end_year), target_dir=TARGET_DATASET_DIR["penn"])

dataset_info.py

+81
"""
Dataset Info
============

Author: Matthew McKay (mamckay@gmail.com)

Central source of dataset information, directory structure etc.

"""

import sys
import os

if sys.platform.startswith('win'):
    DATA_DIR = r"D:/work-data/datasets/"
elif sys.platform.startswith('darwin') or sys.platform.startswith('linux'):
    abs_path = os.path.expanduser("~")
    DATA_DIR = abs_path + "/work-data/datasets/"

#-Source Information-#

SOURCE_DIR = {
    "nber"    : DATA_DIR + "36a376e5a01385782112519bddfac85e" + "/",
    "baci96"  : DATA_DIR + "e988b6544563675492b59f397a8cb6bb" + "/",
    "wdi"     : DATA_DIR + "70146f20cf40f818e6733d552c6cabb5" + "/",
    "atlas"   : DATA_DIR + "2d48c79173719bd41eb5e192fb4470b6" + "/",
    "penn"    : DATA_DIR + "2c2e8d593f39ee74aeb2c7c17047ea3f" + "/",
    "waziarg" : DATA_DIR + "e93e2009b02d39655f1beb5bcaaf04a8" + "/",
}
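# (Note: the source directories appear to be keyed by a hash of each raw
# archive; for example the "baci96" entry matches the md5 recorded in the
# dataset_construct_baci.py docstring.)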
#-Check Environment Settings-#
for source in SOURCE_DIR.keys():
    if not os.path.isdir(SOURCE_DIR[source]):
        raise ValueError("Directory: %s is not found!" % SOURCE_DIR[source])

#-Target Information-#

TARGET_RAW_DIR = {
    "nber"   : "./output/raw/",
    "baci96" : "./output/raw/",
    "wdi"    : "./output/raw/",
}

TARGET_DATASET_DIR = {
    "nber"       : "./output/dataset/nber/",
    "baci96"     : "./output/dataset/baci96/",
    "nberbaci96" : "./output/dataset/nberbaci96/",
    "regression" : "./output/dataset/regression/",
    "atlas"      : "./output/dataset/atlas/",
    "wdi"        : "./output/dataset/wdi/",
    "penn"       : "./output/dataset/penn/",
}

RESULTS_DIR = {
    #-General Analysis-#
    "nber"       : "./output/results/nber/",
    "baci96"     : "./output/results/baci96/",
    "nberbaci96" : "./output/results/nberbaci96/",
    "atlas"      : "./output/results/atlas/",
}

#-Dataset Attributes-#
YEARS = {
    "nber"   : (1962, 2000),
    "baci96" : (1998, 2012),
}

#-Thesis Chapter Level Results-#
CHAPTER_RESULTS = {
    1   : "./output/chapter1/",
    2   : "./output/chapter2/",
    3   : "./output/chapter3/",
    4   : "./output/chapter4/",
    5   : "./output/chapter5/",
    6   : "./output/chapter6/",
    "A" : "./output/appendixA/",
    "B" : "./output/appendixB/",
    "C" : "./output/appendixC/",
    "D" : "./output/appendixD/",
    "G" : "./output/appendixG/",
}

setup.py

+117
"""
PhD Thesis Setup
================

This script sets up the thesis project and any required folders etc.

"""

import os

FOLDERS = [
    #-Output Directories-#
    "output",
    "output/raw",  #Files should be clearly marked as raw_sources.h5
    "output/dataset/",
    "output/dataset/nber",
    "output/dataset/nber/Y7400/",
    "output/dataset/nber/Y8400/",
    "output/dataset/baci96",
    "output/dataset/baci96/harmonised",
    "output/dataset/baci96/harmonised/Y7400/",
    "output/dataset/baci96/harmonised/Y8400/",
    "output/dataset/nberbaci96",
    "output/dataset/nberbaci96/Y7400/",
    "output/dataset/nberbaci96/Y8400",
    "output/dataset/regression/",
    "output/dataset/atlas/",
    "output/dataset/wdi/",
    "output/dataset/penn/",

    #-NBER Results-#
    "output/results/nber",
    "output/results/nber/intertemporal-productcodes/",
    "output/results/nber/intertemporal-productcodes/Y7400/",
    "output/results/nber/intertemporal-productcodes/Y8400/",
    "output/results/nber/intertemporal-productcodes-sitcl4/",
    "output/results/nber/intertemporal-productcodes-sitcl4/raw/",
    "output/results/nber/intertemporal-productcodes-sitcl4/plots/",
    "output/results/nber/intertemporal-productcodes-sitcl3/",
    "output/results/nber/intertemporal-productcodes-sitcl3/raw/",
    "output/results/nber/intertemporal-productcodes-sitcl2/",
    "output/results/nber/intertemporal-productcodes-sitcl2/raw/",
    "output/results/nber/intertemporal-exporters/",
    "output/results/nber/intertemporal-exporters/raw/",
    "output/results/nber/intertemporal-countrycodes/",
    "output/results/nber/intertemporal-countrycodes/raw/",
    #-NBER Tables-#
    "output/results/nber/tables/",
    "output/results/nber/tables/Y7400/",
    "output/results/nber/tables/Y8400/",
    #-NBER Plots-#
    "output/results/nber/plots/",
    "output/results/nber/plots/percent_unofficial_codes/",
    "output/results/nber/plots/percent_world_values/",
    "output/results/nber/plots/percent_world_values/Y7400/",
    "output/results/nber/plots/percent_world_values/Y8400/",

    #-BACI96 Results-#
    "output/results/baci96",
    "output/results/baci96/intertemporal-countrycodes/",
    "output/results/baci96/intertemporal-productcodes/",
    #-BACI96 Tables-#
    "output/results/baci96/tables/",
    #-BACI96 Plots-#
    "output/results/baci96/plots/",
    "output/results/baci96/plots/percent_world_values/",

    #-Combined Dataset Results-#
    "output/results/nberbaci96",
    "output/results/nberbaci96/intertemporal-countrycodes/",
    "output/results/nberbaci96/intertemporal-countrycodes/Y7400/",
    "output/results/nberbaci96/intertemporal-countrycodes/Y8400/",
    "output/results/nberbaci96/intertemporal-productcodes/",
    "output/results/nberbaci96/intertemporal-productcodes/Y7400/",
    "output/results/nberbaci96/intertemporal-productcodes/Y8400/",
    #-NBERBACI Tables-#
    "output/results/nberbaci96/tables/",
    "output/results/nberbaci96/tables/Y7400/",
    "output/results/nberbaci96/tables/Y8400/",
    #-NBERBACI Plots-#
    "output/results/nberbaci96/plots/",
    "output/results/nberbaci96/plots/percent_world_values/",

    #-Atlas of Complexity-#
    "output/results/atlas/",
    "output/results/atlas/intertemporal-productcodes/",
    "output/results/atlas/intertemporal-countrycodes/",
    "output/results/atlas/tables/",
    "output/results/atlas/plots/",
    "output/results/atlas/plots/intertemporal-productcodes-num/",

    #-Chapter and Appendix Working Areas-#
    "output/chapter1/",
    "output/chapter2/",
    "output/chapter3/",
    "output/chapter3/sensativity-analysis/",
    "output/chapter3/plots/",
    "output/chapter4/",
    "output/chapter5/",
    "output/chapter6/",
    "output/appendixA/",
    "output/appendixB/",
    "output/appendixC/",
    "output/appendixD/",
    "output/appendixG/",

    #-Log Directory-#
    "log/",

]

#-Setup Folders-#

for folder in FOLDERS:
    if not os.path.exists(folder):
        print "[Setup] Creating directory: %s" % folder
        os.makedirs(folder)
