Commit ef1b085

ipynb to create csv
1 parent a88f1dd commit ef1b085

2 files changed, 2710 insertions(+), 0 deletions(-)

create_dataset.ipynb (+349)

@@ -0,0 +1,349 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import sys\n",
"import pandas as pd\n",
"from pandas import json_normalize\n",
"# Credits: https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py\n",
"from states import city_to_state_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Current directory\n",
"dirName = 'JSON_tweets/'\n",
"dirTweets= sorted(os.listdir(dirName))\n",
"\n",
"df = pd.DataFrame(columns=['id','created_at','text','location','lang'])\n",
"\n",
"for tweet in dirTweets:\n",
"    with open(dirName+tweet) as f:\n",
"        all_tweet = json.load(f)\n",
"    for i in range(1, len(all_tweet)-1):\n",
"        try:\n",
"            info = all_tweet[i]['row']['columns']\n",
"        except:\n",
"            print(all_tweet[i])\n",
"        new_json = {\n",
"            'id': info[2],\n",
"            'created_at': info[1],\n",
"            'text': info[3],\n",
"            'location': info[6]['LOCATION'],\n",
"            'lang': info[6]['LANG']\n",
"        }\n",
"        df = df.append(json_normalize(new_json))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>id</th>\n",
"      <th>created_at</th>\n",
"      <th>text</th>\n",
"      <th>location</th>\n",
"      <th>lang</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837525155663875</td>\n",
"      <td>1607800296000</td>\n",
"      <td>@SpartyHicks @FoxNews We got the #Oil the worl...</td>\n",
"      <td>Texas</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837528758575113</td>\n",
"      <td>1607800297000</td>\n",
"      <td>RT @Forbes: Meet the Fiskers, the billionaire ...</td>\n",
"      <td>Canada</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837534248898561</td>\n",
"      <td>1607800298000</td>\n",
"      <td>@toxicpath It’s a pleasant conspiracy theory o...</td>\n",
"      <td>Kansas City, MO</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837537948262402</td>\n",
"      <td>1607800299000</td>\n",
"      <td>@dealer_of_happy 1st Tesla-world problem 😉</td>\n",
"      <td>None</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837550216613888</td>\n",
"      <td>1607800302000</td>\n",
"      <td>RT @discord: ok this year's snowsgiving giveaw...</td>\n",
"      <td>None</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                    id     created_at  \\\n",
"0  1337837525155663875  1607800296000   \n",
"0  1337837528758575113  1607800297000   \n",
"0  1337837534248898561  1607800298000   \n",
"0  1337837537948262402  1607800299000   \n",
"0  1337837550216613888  1607800302000   \n",
"\n",
"                                                text         location  lang  \n",
"0  @SpartyHicks @FoxNews We got the #Oil the worl...            Texas  None  \n",
"0  RT @Forbes: Meet the Fiskers, the billionaire ...           Canada  None  \n",
"0  @toxicpath It’s a pleasant conspiracy theory o...  Kansas City, MO  None  \n",
"0         @dealer_of_happy 1st Tesla-world problem 😉             None  None  \n",
"0  RT @discord: ok this year's snowsgiving giveaw...             None  None  "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"csv_file = 'prueba.csv'\n",
"df.to_csv(csv_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"    TODO: Add csv to S3\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"s3 = boto3.resource('s3')\n",
"s3.meta.client.upload_file(csv_file, 'mybucket', csv_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"    Attempt to transform cities into states\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Texas\n",
"other= Canada\n",
"Kansas\n",
"other= MO\n",
"California\n",
"other= CA\n",
"other= United States\n",
"other= Iceland\n",
"other= fede • emma • luca • taís\n",
"other= Osorno\n",
"other= Chile\n",
"other= ARIZONA\n",
"other= 59.932094\n",
"other= 30.335732\n",
"other= ATL GA USA\n",
"Texas\n",
"other= Dildo\n",
"other= NL\n",
"New York\n",
"other= NY\n",
"other= Maui\n",
"other= Hawaii\n",
"other= FL\n",
"Nebraska\n",
"other= USA\n",
"other= she/her\n",
"Missouri\n",
"other= MO\n",
"Oregon\n",
"other= ME\n",
"California\n",
"other= CA\n",
"other= he/him\n",
"California\n",
"other= CA\n",
"Washington\n",
"other= USA\n",
"other= Oceania\n",
"other= México\n",
"other= Your moms house\n",
"California\n",
"other= CA\n",
"California\n",
"other= CA\n",
"other= South Africa\n",
"other= Maui\n",
"other= Hawaii\n",
"other= The milk bar\n",
"other= IL\n",
"other= Badajoz\n",
"other= Spain\n",
"Minnesota\n",
"other= TX\n",
"other= Vancouver Island BC CANADA\n",
"Michigan\n",
"other= MI/Dallas\n",
"other= TX\n",
"other= Western Finland\n",
"other= Lake Mary\n",
"other= FL\n",
"other= 1930s USA aka Florida.\n",
"other= ults: exo | got7 | txt\n",
"other= Maui\n",
"other= Hawaii\n",
"California\n",
"other= CA\n",
"other= Bogotá\n",
"other= Colombia\n",
"New York\n",
"other= N.Y.\n",
"other= 🌴🐰👑🍑🌞🐍🌼\n",
"other= Western Finland\n",
"other= são paulo\n",
"other= She/They\n",
"Maryland\n",
"other= TN\n",
"other= USA\n",
"other= Mythical land called Sanity\n",
"Ohio\n",
"other= IA\n",
"other= United States\n",
"other= 647.218.2414\n",
"New Hampshire\n",
"other= England\n",
"other= Deutschland\n",
"Washington\n",
"other= WA\n",
"other= Wien\n",
"other= Österreich\n",
"other= Cambodia\n",
"other= GA\n",
"California\n",
"other= California\n",
"Florida\n",
"other= FL\n",
"Minnesota\n",
"other= MN\n",
"Illinois\n",
"other= IL\n",
"other= South Africa\n"
]
}
],
"source": [
"for l in df['location']:\n",
"    if l is not None:\n",
"        l = l.strip()\n",
"        words = l.split(\",\")\n",
"        for w in words:\n",
"            if w in city_to_state_dict.values():\n",
"                print(w)\n",
"            elif w in city_to_state_dict.keys():\n",
"                print(city_to_state_dict[w])\n",
"            else:\n",
"                print(\"other= %s\" %(w)) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
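Note on the loading cell: the bare except only prints the offending entry and then still builds new_json from whatever info held on the previous iteration (or raises NameError if the very first entry fails), and DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of an equivalent loader that skips malformed entries and builds the frame once, assuming the same JSON_tweets/ layout and the same column positions (info[1], info[2], info[3], info[6]) the notebook uses:

import json
import os

import pandas as pd

dir_name = 'JSON_tweets/'
rows = []

for name in sorted(os.listdir(dir_name)):
    with open(os.path.join(dir_name, name)) as f:
        all_tweet = json.load(f)
    # Same slice as the notebook: skip the first and last entries of each file.
    for entry in all_tweet[1:-1]:
        try:
            info = entry['row']['columns']
            rows.append({
                'id': info[2],
                'created_at': info[1],
                'text': info[3],
                'location': info[6]['LOCATION'],
                'lang': info[6]['LANG'],
            })
        except (KeyError, IndexError, TypeError):
            # Malformed entry: report it and skip it instead of reusing the previous row.
            print(entry)

df = pd.DataFrame(rows, columns=['id', 'created_at', 'text', 'location', 'lang'])

Building the frame once also gives a clean 0..n-1 index; with the per-row append every row keeps index 0, as the df.head() output above shows, and that index ends up as a column of zeros in prueba.csv unless to_csv is called with index=False.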

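The last executed cell only prints whether each comma-separated piece of a location is a known state or city; nothing is stored, and pieces after a comma keep their leading space (so " MO" never matches, which is consistent with the other= lines in the output). A small sketch of the same lookup written as a helper that fills a new state column, still using city_to_state_dict from states.py; taking the first match as the answer and the column name state are assumptions for illustration, not something the notebook commits to.

from states import city_to_state_dict  # same city -> state mapping the notebook imports

def location_to_state(location):
    # Return a US state name for a free-text location, or None if nothing matches.
    if not isinstance(location, str):
        return None
    for part in (p.strip() for p in location.split(',')):
        if part in city_to_state_dict.values():   # the piece is already a state name
            return part
        if part in city_to_state_dict:            # the piece is a known city
            return city_to_state_dict[part]
    return None

df['state'] = df['location'].apply(location_to_state)
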
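For the upload cell, 'mybucket' is presumably a placeholder bucket name, and boto3 is assumed to pick up credentials and region from the usual environment or ~/.aws configuration. A slightly more defensive sketch of the same call, using the S3 client directly and catching ClientError (bucket name still a placeholder):

import boto3
from botocore.exceptions import ClientError

csv_file = 'prueba.csv'   # the file written by df.to_csv above
bucket = 'mybucket'       # placeholder: replace with the real bucket name

s3 = boto3.client('s3')
try:
    # upload_file(local path, bucket, object key)
    s3.upload_file(csv_file, bucket, csv_file)
except ClientError as err:
    print('Upload failed:', err)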