Commit ef1b085

ipynb to create csv
1 parent a88f1dd commit ef1b085

2 files changed, 2710 insertions(+), 0 deletions(-)

create_dataset.ipynb (+349)

@@ -0,0 +1,349 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import sys\n",
"import pandas as pd\n",
"from pandas import json_normalize\n",
"# Credits: https://github.com/agalea91/city_to_state_dictionary/blob/master/city_to_state.py\n",
"from states import city_to_state_dict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Current directory\n",
"dirName = 'JSON_tweets/'\n",
"dirTweets= sorted(os.listdir(dirName))\n",
"\n",
"df = pd.DataFrame(columns=['id','created_at','text','location','lang'])\n",
"\n",
"for tweet in dirTweets:\n",
"    with open(dirName+tweet) as f:\n",
"        all_tweet = json.load(f)\n",
"    for i in range(1, len(all_tweet)-1):\n",
"        try:\n",
"            info = all_tweet[i]['row']['columns']\n",
"        except:\n",
"            print(all_tweet[i])\n",
"        new_json = {\n",
"            'id': info[2],\n",
"            'created_at': info[1],\n",
"            'text': info[3],\n",
"            'location': info[6]['LOCATION'],\n",
"            'lang': info[6]['LANG']\n",
"        }\n",
"        df = df.append(json_normalize(new_json))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>id</th>\n",
"      <th>created_at</th>\n",
"      <th>text</th>\n",
"      <th>location</th>\n",
"      <th>lang</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837525155663875</td>\n",
"      <td>1607800296000</td>\n",
"      <td>@SpartyHicks @FoxNews We got the #Oil the worl...</td>\n",
"      <td>Texas</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837528758575113</td>\n",
"      <td>1607800297000</td>\n",
"      <td>RT @Forbes: Meet the Fiskers, the billionaire ...</td>\n",
"      <td>Canada</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837534248898561</td>\n",
"      <td>1607800298000</td>\n",
"      <td>@toxicpath It’s a pleasant conspiracy theory o...</td>\n",
"      <td>Kansas City, MO</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837537948262402</td>\n",
"      <td>1607800299000</td>\n",
"      <td>@dealer_of_happy 1st Tesla-world problem 😉</td>\n",
"      <td>None</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>1337837550216613888</td>\n",
"      <td>1607800302000</td>\n",
"      <td>RT @discord: ok this year's snowsgiving giveaw...</td>\n",
"      <td>None</td>\n",
"      <td>None</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"                    id     created_at  \\\n",
"0  1337837525155663875  1607800296000   \n",
"0  1337837528758575113  1607800297000   \n",
"0  1337837534248898561  1607800298000   \n",
"0  1337837537948262402  1607800299000   \n",
"0  1337837550216613888  1607800302000   \n",
"\n",
"                                                text         location  lang  \n",
"0  @SpartyHicks @FoxNews We got the #Oil the worl...            Texas  None  \n",
"0  RT @Forbes: Meet the Fiskers, the billionaire ...           Canada  None  \n",
"0  @toxicpath It’s a pleasant conspiracy theory o...  Kansas City, MO  None  \n",
"0         @dealer_of_happy 1st Tesla-world problem 😉             None  None  \n",
"0  RT @discord: ok this year's snowsgiving giveaw...             None  None  "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"csv_file = 'prueba.csv'\n",
"df.to_csv(csv_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"    TODO: Add csv to S3\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"s3 = boto3.resource('s3')\n",
"s3.meta.client.upload_file(csv_file, 'mybucket', csv_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"    Attempt to transform cities into states\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Texas\n",
"other= Canada\n",
"Kansas\n",
"other= MO\n",
"California\n",
"other= CA\n",
"other= United States\n",
"other= Iceland\n",
"other= fede • emma • luca • taís\n",
"other= Osorno\n",
"other= Chile\n",
"other= ARIZONA\n",
"other= 59.932094\n",
"other= 30.335732\n",
"other= ATL GA USA\n",
"Texas\n",
"other= Dildo\n",
"other= NL\n",
"New York\n",
"other= NY\n",
"other= Maui\n",
"other= Hawaii\n",
"other= FL\n",
"Nebraska\n",
"other= USA\n",
"other= she/her\n",
"Missouri\n",
"other= MO\n",
"Oregon\n",
"other= ME\n",
"California\n",
"other= CA\n",
"other= he/him\n",
"California\n",
"other= CA\n",
"Washington\n",
"other= USA\n",
"other= Oceania\n",
"other= México\n",
"other= Your moms house\n",
"California\n",
"other= CA\n",
"California\n",
"other= CA\n",
"other= South Africa\n",
"other= Maui\n",
"other= Hawaii\n",
"other= The milk bar\n",
"other= IL\n",
"other= Badajoz\n",
"other= Spain\n",
"Minnesota\n",
"other= TX\n",
"other= Vancouver Island BC CANADA\n",
"Michigan\n",
"other= MI/Dallas\n",
"other= TX\n",
"other= Western Finland\n",
"other= Lake Mary\n",
"other= FL\n",
"other= 1930s USA aka Florida.\n",
"other= ults: exo | got7 | txt\n",
"other= Maui\n",
"other= Hawaii\n",
"California\n",
"other= CA\n",
"other= Bogotá\n",
"other= Colombia\n",
"New York\n",
"other= N.Y.\n",
"other= 🌴🐰👑🍑🌞🐍🌼\n",
"other= Western Finland\n",
"other= são paulo\n",
"other= She/They\n",
"Maryland\n",
"other= TN\n",
"other= USA\n",
"other= Mythical land called Sanity\n",
"Ohio\n",
"other= IA\n",
"other= United States\n",
"other= 647.218.2414\n",
"New Hampshire\n",
"other= England\n",
"other= Deutschland\n",
"Washington\n",
"other= WA\n",
"other= Wien\n",
"other= Österreich\n",
"other= Cambodia\n",
"other= GA\n",
"California\n",
"other= California\n",
"Florida\n",
"other= FL\n",
"Minnesota\n",
"other= MN\n",
"Illinois\n",
"other= IL\n",
"other= South Africa\n"
]
}
],
"source": [
"for l in df['location']:\n",
"    if l is not None:\n",
"        l = l.strip()\n",
"        words = l.split(\",\")\n",
"        for w in words:\n",
"            if w in city_to_state_dict.values():\n",
"                print(w)\n",
"            elif w in city_to_state_dict.keys():\n",
"                print(city_to_state_dict[w])\n",
"            else:\n",
"                print(\"other= %s\" %(w)) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
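Note on the loading cell: the bare except only prints the offending entry and then still builds new_json from whatever info held on the previous iteration (or raises NameError if the very first entry fails), and DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. A minimal sketch of an equivalent loader that skips malformed entries and builds the frame once, assuming the same JSON_tweets/ layout and the same column positions (info[1], info[2], info[3], info[6]) the notebook uses:

import json
import os

import pandas as pd

dir_name = 'JSON_tweets/'
rows = []

for name in sorted(os.listdir(dir_name)):
    with open(os.path.join(dir_name, name)) as f:
        all_tweet = json.load(f)
    # Same slice as the notebook: skip the first and last entries of each file.
    for entry in all_tweet[1:-1]:
        try:
            info = entry['row']['columns']
            rows.append({
                'id': info[2],
                'created_at': info[1],
                'text': info[3],
                'location': info[6]['LOCATION'],
                'lang': info[6]['LANG'],
            })
        except (KeyError, IndexError, TypeError):
            # Malformed entry: report it and skip it instead of reusing the previous row.
            print(entry)

df = pd.DataFrame(rows, columns=['id', 'created_at', 'text', 'location', 'lang'])

Building the frame once also gives a clean 0..n-1 index; with the per-row append every row keeps index 0, as the df.head() output above shows, and that index ends up as a column of zeros in prueba.csv unless to_csv is called with index=False.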

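The last executed cell only prints whether each comma-separated piece of a location is a known state or city; nothing is stored, and pieces after a comma keep their leading space (so " MO" never matches, which is consistent with the other= lines in the output). A small sketch of the same lookup written as a helper that fills a new state column, still using city_to_state_dict from states.py; taking the first match as the answer and the column name state are assumptions for illustration, not something the notebook commits to.

from states import city_to_state_dict  # same city -> state mapping the notebook imports

def location_to_state(location):
    # Return a US state name for a free-text location, or None if nothing matches.
    if not isinstance(location, str):
        return None
    for part in (p.strip() for p in location.split(',')):
        if part in city_to_state_dict.values():   # the piece is already a state name
            return part
        if part in city_to_state_dict:            # the piece is a known city
            return city_to_state_dict[part]
    return None

df['state'] = df['location'].apply(location_to_state)
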
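For the upload cell, 'mybucket' is presumably a placeholder bucket name, and boto3 is assumed to pick up credentials and region from the usual environment or ~/.aws configuration. A slightly more defensive sketch of the same call, using the S3 client directly and catching ClientError (bucket name still a placeholder):

import boto3
from botocore.exceptions import ClientError

csv_file = 'prueba.csv'   # the file written by df.to_csv above
bucket = 'mybucket'       # placeholder: replace with the real bucket name

s3 = boto3.client('s3')
try:
    # upload_file(local path, bucket, object key)
    s3.upload_file(csv_file, bucket, csv_file)
except ClientError as err:
    print('Upload failed:', err)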