Commit 00814da

init

1 parent e486b80 commit 00814da

File tree

9,436 files changed: +3,361,662 -0 lines changed (large commit; most file contents are hidden by default)
@@ -0,0 +1,40 @@
{
    "folders": [
        {
            "path": "."
        },
        {
            "path": "../data_notebooks-ALL"
        },
        {
            "path": "../data_GCP Pro Data Eng Certification/GCP Learning"
        },
        {
            "path": "../../GitHub-Repo/Wrangling_PySpark"
        }
    ],
    "settings": {
        "jupyter.kernels.filter": [
            {
                "path": "/Users/JCachat/Library/Jupyter/kernels/data/kernel.json",
                "type": "jupyterKernelspec"
            },
            {
                "path": "/Users/JCachat/Library/Jupyter/kernels/mito_env/kernel.json",
                "type": "jupyterKernelspec"
            },
            {
                "path": "/usr/bin/python3",
                "type": "pythonEnvironment"
            },
            {
                "path": "/usr/local/bin/python3",
                "type": "pythonEnvironment"
            },
            {
                "path": "/Users/JCachat/Library/Jupyter/kernels/timeseries/kernel.json",
                "type": "jupyterKernelspec"
            }
        ]
    }
}

.DS_Store

0 Bytes
Binary file not shown.

.gitignore

+3
@@ -0,0 +1,3 @@

*/.ipynb_checkpoints
data

0-Ingest/.DS_Store

6 KB
Binary file not shown.
@@ -0,0 +1,265 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Accessing the Public Data API with Python (Jupyter Notebook & Pandas)\n",
    "\n",
    "Adapted from the bls.gov 'API Version 2.0 Python Sample Code' by Mark McEnearney to use Pandas in a Jupyter Notebook.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build and submit request, then format response as JSON\n",
    "Requesting data for six state unemployment-rate series, including LASST060000000000003 (California) and LASST080000000000003 (Colorado), between the years 2017 and 2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = {'Content-type': 'application/json'}\n",
    "data = json.dumps({\"seriesid\": ['LASST010000000000003','LASST020000000000003','LASST040000000000003','LASST050000000000003','LASST060000000000003','LASST080000000000003'],\"startyear\":\"2017\", \"endyear\":\"2022\",\"registrationkey\":\"8988511dabfc4508a0f08c0c051c1476\"})\n",
    "\n",
    "p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=data, headers=headers)\n",
    "\n",
    "json_data = json.loads(p.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Iterate through series data to build a list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "parsed_data = []\n",
    "for series in json_data['Results']['series']:\n",
    "    seriesId = series['seriesID']\n",
    "    for item in series['data']:\n",
    "        year = item['year']\n",
    "        period = item['period']\n",
    "        value = item['value']\n",
    "        parsed_data.append([seriesId,year,period,value])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert parsed list data into a Pandas DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(parsed_data, columns=['seriesID', 'year', 'period', 'value'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Describe and display dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seriesID</th>\n",
       "      <th>year</th>\n",
       "      <th>period</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LASST010000000000003</td>\n",
       "      <td>2022</td>\n",
       "      <td>M06</td>\n",
       "      <td>2.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LASST010000000000003</td>\n",
       "      <td>2022</td>\n",
       "      <td>M05</td>\n",
       "      <td>2.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LASST010000000000003</td>\n",
       "      <td>2022</td>\n",
       "      <td>M04</td>\n",
       "      <td>2.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LASST010000000000003</td>\n",
       "      <td>2022</td>\n",
       "      <td>M03</td>\n",
       "      <td>2.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LASST010000000000003</td>\n",
       "      <td>2022</td>\n",
       "      <td>M02</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>391</th>\n",
       "      <td>LASST080000000000003</td>\n",
       "      <td>2017</td>\n",
       "      <td>M05</td>\n",
       "      <td>2.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>392</th>\n",
       "      <td>LASST080000000000003</td>\n",
       "      <td>2017</td>\n",
       "      <td>M04</td>\n",
       "      <td>2.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>393</th>\n",
       "      <td>LASST080000000000003</td>\n",
       "      <td>2017</td>\n",
       "      <td>M03</td>\n",
       "      <td>2.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>394</th>\n",
       "      <td>LASST080000000000003</td>\n",
       "      <td>2017</td>\n",
       "      <td>M02</td>\n",
       "      <td>2.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>395</th>\n",
       "      <td>LASST080000000000003</td>\n",
       "      <td>2017</td>\n",
       "      <td>M01</td>\n",
       "      <td>2.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>396 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 seriesID  year period value\n",
       "0    LASST010000000000003  2022    M06   2.6\n",
       "1    LASST010000000000003  2022    M05   2.7\n",
       "2    LASST010000000000003  2022    M04   2.8\n",
       "3    LASST010000000000003  2022    M03   2.9\n",
       "4    LASST010000000000003  2022    M02   3.0\n",
       "..                    ...   ...    ...   ...\n",
       "391  LASST080000000000003  2017    M05   2.4\n",
       "392  LASST080000000000003  2017    M04   2.4\n",
       "393  LASST080000000000003  2017    M03   2.4\n",
       "394  LASST080000000000003  2017    M02   2.5\n",
       "395  LASST080000000000003  2017    M01   2.6\n",
       "\n",
       "[396 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('data')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "vscode": {
   "interpreter": {
    "hash": "61f816485507dc889ff66c817d3dfe3a5ba58d9e76a81eab42366b9b45bfe58a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
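The notebook's three steps (build the request, flatten the JSON, load a DataFrame) can also be rolled into one reusable helper. The sketch below is only an illustrative consolidation of the same BLS v2 API calls, not part of the commit; the function name fetch_bls_series and the YOUR_BLS_API_KEY placeholder are assumptions.

```python
import json

import pandas as pd
import requests

BLS_API_URL = "https://api.bls.gov/publicAPI/v2/timeseries/data/"


def fetch_bls_series(series_ids, start_year, end_year, registration_key):
    """Request one or more BLS time series and return them as a tidy DataFrame."""
    payload = json.dumps({
        "seriesid": series_ids,
        "startyear": str(start_year),
        "endyear": str(end_year),
        "registrationkey": registration_key,
    })
    resp = requests.post(
        BLS_API_URL,
        data=payload,
        headers={"Content-type": "application/json"},
    )
    resp.raise_for_status()  # surface HTTP errors before trying to parse the body
    series_list = resp.json()["Results"]["series"]
    rows = [
        [series["seriesID"], item["year"], item["period"], item["value"]]
        for series in series_list
        for item in series["data"]
    ]
    return pd.DataFrame(rows, columns=["seriesID", "year", "period", "value"])


# Example call mirroring the notebook (placeholder key, not a real credential):
# df = fetch_bls_series(
#     ["LASST060000000000003", "LASST080000000000003"], 2017, 2022, "YOUR_BLS_API_KEY"
# )
```

Calling raise_for_status() up front fails fast on HTTP errors instead of letting the later JSON parsing raise a less obvious KeyError.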
@@ -0,0 +1,25 @@
# Using INFORMATION_SCHEMA views & ASSERT to test BigQuery SQL statements

https://cloud.google.com/bigquery/docs/information-schema-views

https://medium.com/google-cloud/validating-successful-execution-of-bigquery-scripts-using-assert-c82f7ff9cfa8

You can make a smaller, randomly sampled version of your table:

    CREATE TABLE `project.testdataset.tablename`
    AS SELECT * FROM `project.proddataset.tablename` WHERE RAND() > 0.9

to keep roughly 10% of the rows (use RAND() > 0.99 to keep about 1%). Run it more than once and you'll get different rows each time, since RAND() is random; hash a timestamp (or another stable column) instead to get repeatable results, as sketched below.

Then all you're changing between test and prod is the dataset name.
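As a rough illustration of the "hash a timestamp" idea, here is a hedged sketch using the google-cloud-bigquery client: it swaps RAND() for FARM_FINGERPRINT so the same rows are selected on every run. The project, dataset, table, and event_timestamp column names are placeholders, not objects from this repo.

```python
from google.cloud import bigquery  # pip install google-cloud-bigquery

client = bigquery.Client()  # uses application-default credentials

# All identifiers below are placeholders: swap in your own project, datasets,
# table, and a stable column to hash (a timestamp or key column).
sample_sql = """
CREATE OR REPLACE TABLE `my-project.testdataset.tablename` AS
SELECT *
FROM `my-project.proddataset.tablename`
-- FARM_FINGERPRINT is deterministic, so the same rows come back on every run,
-- unlike RAND(); MOD(..., 100) < 10 keeps roughly 10 rows in 100.
WHERE MOD(ABS(FARM_FINGERPRINT(CAST(event_timestamp AS STRING))), 100) < 10
"""

client.query(sample_sql).result()  # .result() blocks until the DDL job finishes
```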
https://ianwhitestone.work/testing-sql/

https://pypi.org/project/bq-test-kit/
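To pair with the ASSERT article linked above, a minimal sketch of a post-load check, again assuming the hypothetical test table from the sampling sketch; BigQuery's ASSERT statement fails the query job when its condition is false, which is what makes it usable as a lightweight test.

```python
from google.cloud import bigquery

client = bigquery.Client()

# Fail loudly if the sampled test table (placeholder name from the sketch above)
# came out empty; ASSERT raises a query error when its condition is false.
assert_sql = """
ASSERT (
  (SELECT COUNT(*) FROM `my-project.testdataset.tablename`) > 0
) AS 'testdataset.tablename should not be empty after sampling'
"""

try:
    client.query(assert_sql).result()
    print("assertion passed")
except Exception as exc:  # a failed ASSERT surfaces here as a query error
    print(f"assertion failed: {exc}")
```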

0 commit comments
