Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: normalize: clean normalization of text input (#15) #49

Merged
merged 3 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
388 changes: 388 additions & 0 deletions etl/experiments/4.process_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,388 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Necessary imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def remove_space_at_the_end(x: str):\n",
" if x is not None:\n",
" return x.strip()\n",
"\n",
"def replace_double_quote(x: str):\n",
" if x is not None:\n",
" return x.replace(\"\\\"\\\"\", \"'\")\n",
"\n",
"def normalize(data: pd.DataFrame, text_columns):\n",
" data[text_columns] = data[text_columns].apply(\n",
" lambda x: x.apply(remove_space_at_the_end)\n",
" )\n",
"\n",
" data[text_columns] = data[text_columns].apply(\n",
" lambda x: x.apply(replace_double_quote)\n",
" )\n",
"\n",
" data[\"titre\"] = data[\"titre\"].apply(lambda x: x.upper())\n",
" data[\"objet\"] = data[\"objet\"].apply(lambda x: x.lower())\n",
"\n",
" return data\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and viz data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv(\"../ref-rna-real-mars-2022-enriched-not-qualified.csv\", index_col=0)\n",
"data = data[data.columns[1:]] # ignore first column it is index not correctly saved"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 1071 entries, 0 to 1070\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 1071 non-null object \n",
" 1 titre 1071 non-null object \n",
" 2 objet 1071 non-null object \n",
" 3 adrs_numvoie 978 non-null object \n",
" 4 adrs_typevoie 989 non-null object \n",
" 5 adrs_libvoie 1015 non-null object \n",
" 6 adrs_codepostal 1070 non-null float64\n",
" 7 adrs_libcommune 1071 non-null object \n",
" 8 siteweb 32 non-null object \n",
" 9 adrs 1071 non-null object \n",
" 10 dept 1071 non-null object \n",
" 11 region 1071 non-null object \n",
" 12 social_object1_libelle 1071 non-null object \n",
" 13 social_object2_libelle 1071 non-null object \n",
" 14 longitude 1071 non-null float64\n",
" 15 latitude 1071 non-null float64\n",
" 16 facebook_url 1071 non-null object \n",
" 17 helloasso_url 1071 non-null object \n",
"dtypes: float64(3), object(15)\n",
"memory usage: 159.0+ KB\n"
]
}
],
"source": [
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>titre</th>\n",
" <th>objet</th>\n",
" <th>adrs_numvoie</th>\n",
" <th>adrs_typevoie</th>\n",
" <th>adrs_libvoie</th>\n",
" <th>adrs_codepostal</th>\n",
" <th>adrs_libcommune</th>\n",
" <th>siteweb</th>\n",
" <th>adrs</th>\n",
" <th>dept</th>\n",
" <th>region</th>\n",
" <th>social_object1_libelle</th>\n",
" <th>social_object2_libelle</th>\n",
" <th>longitude</th>\n",
" <th>latitude</th>\n",
" <th>facebook_url</th>\n",
" <th>helloasso_url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>W751218914</td>\n",
" <td>ASSOCIATION 347 BIS SOLIDARITE LGBT CAMEROUN</td>\n",
" <td>défense des droits humains tels que définis pa...</td>\n",
" <td>51</td>\n",
" <td>AV</td>\n",
" <td>Gambetta</td>\n",
" <td>75020.0</td>\n",
" <td>Paris</td>\n",
" <td>NaN</td>\n",
" <td>51 AV Gambetta 75020 Paris</td>\n",
" <td>Paris</td>\n",
" <td>Île-de-France</td>\n",
" <td>AMICALES/ GROUPEMENTS AFFINITAIRES/ GROUPEMENT...</td>\n",
" <td>AUTRES</td>\n",
" <td>2.394138</td>\n",
" <td>48.864734</td>\n",
" <td>https://www.facebook.com/347Bis</td>\n",
" <td>https://www.helloasso.com/associations/stop-ho...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>424</th>\n",
" <td>W771001914</td>\n",
" <td>STEPH'SON ET LUMIERE</td>\n",
" <td>permettre le développement et l'encadrement d'...</td>\n",
" <td>3</td>\n",
" <td>ALL</td>\n",
" <td>des Moissonneurs</td>\n",
" <td>77500.0</td>\n",
" <td>Chelles</td>\n",
" <td>NaN</td>\n",
" <td>3 ALL des Moissonneurs 77500 Chelles</td>\n",
" <td>Seine-et-Marne</td>\n",
" <td>Île-de-France</td>\n",
" <td>CULTURE/ PRATIQUES D'ACTIVITÉS ARTISTIQUES/ PR...</td>\n",
" <td>AUTRES</td>\n",
" <td>2.611912</td>\n",
" <td>48.887420</td>\n",
" <td>https://www.facebook.com/STEPHSON-LUMIERES-180...</td>\n",
" <td>https://www.helloasso.com/associations/associa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1032</th>\n",
" <td>W531001378</td>\n",
" <td>MA MAISON EST TA MAISON</td>\n",
" <td>permettre â des personnes habitant au cameroun...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>53290.0</td>\n",
" <td>Bouère</td>\n",
" <td>NaN</td>\n",
" <td>53290 Bouère</td>\n",
" <td>Mayenne</td>\n",
" <td>Pays de la Loire</td>\n",
" <td>AMICALES/ GROUPEMENTS AFFINITAIRES/ GROUPEMENT...</td>\n",
" <td>AUTRES</td>\n",
" <td>-0.479571</td>\n",
" <td>47.863856</td>\n",
" <td>https://www.facebook.com/locksjacky/videos/ma-...</td>\n",
" <td>https://www.helloasso.com/associations/la-maso...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>W751203402</td>\n",
" <td>FORUM CAMEROUNAIS ET AMIS DU CAMEROUN (FOCAC)</td>\n",
" <td>promouvoir avec les jeunes issus de l'immigrat...</td>\n",
" <td>33</td>\n",
" <td>RUE</td>\n",
" <td>Polonceau</td>\n",
" <td>75018.0</td>\n",
" <td>Paris</td>\n",
" <td>NaN</td>\n",
" <td>33 RUE Polonceau 75018 Paris</td>\n",
" <td>Paris</td>\n",
" <td>Île-de-France</td>\n",
" <td>AIDE À L'EMPLOI/ DÉVELOPPEMENT LOCAL/ PROMOTIO...</td>\n",
" <td>AUTRES</td>\n",
" <td>2.352095</td>\n",
" <td>48.885868</td>\n",
" <td>https://www.facebook.com/Mes-amis-du-Cameroun-...</td>\n",
" <td>https://www.helloasso.com/associations/feiac-f...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>370</th>\n",
" <td>W931013378</td>\n",
" <td>FEMMES DYNAMIQUES DU NOUN FDN</td>\n",
" <td>mettre en place des parrainages pour aider les...</td>\n",
" <td>52</td>\n",
" <td>RUE</td>\n",
" <td>Jean Jaurès</td>\n",
" <td>92230.0</td>\n",
" <td>Gennevilliers</td>\n",
" <td>NaN</td>\n",
" <td>52 RUE Jean Jaurès 92230 Gennevilliers</td>\n",
" <td>Hauts-de-Seine</td>\n",
" <td>Île-de-France</td>\n",
" <td>CULTURE/ PRATIQUES D'ACTIVITÉS ARTISTIQUES/ PR...</td>\n",
" <td>AUTRES</td>\n",
" <td>2.294842</td>\n",
" <td>48.928173</td>\n",
" <td>https://fr-fr.facebook.com/dynamiques.femmes</td>\n",
" <td>https://www.helloasso.com/associations/associa...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id titre \\\n",
"53 W751218914 ASSOCIATION 347 BIS SOLIDARITE LGBT CAMEROUN \n",
"424 W771001914 STEPH'SON ET LUMIERE \n",
"1032 W531001378 MA MAISON EST TA MAISON \n",
"27 W751203402 FORUM CAMEROUNAIS ET AMIS DU CAMEROUN (FOCAC) \n",
"370 W931013378 FEMMES DYNAMIQUES DU NOUN FDN \n",
"\n",
" objet adrs_numvoie \\\n",
"53 défense des droits humains tels que définis pa... 51 \n",
"424 permettre le développement et l'encadrement d'... 3 \n",
"1032 permettre â des personnes habitant au cameroun... NaN \n",
"27 promouvoir avec les jeunes issus de l'immigrat... 33 \n",
"370 mettre en place des parrainages pour aider les... 52 \n",
"\n",
" adrs_typevoie adrs_libvoie adrs_codepostal adrs_libcommune siteweb \\\n",
"53 AV Gambetta 75020.0 Paris NaN \n",
"424 ALL des Moissonneurs 77500.0 Chelles NaN \n",
"1032 NaN NaN 53290.0 Bouère NaN \n",
"27 RUE Polonceau 75018.0 Paris NaN \n",
"370 RUE Jean Jaurès 92230.0 Gennevilliers NaN \n",
"\n",
" adrs dept \\\n",
"53 51 AV Gambetta 75020 Paris Paris \n",
"424 3 ALL des Moissonneurs 77500 Chelles Seine-et-Marne \n",
"1032 53290 Bouère Mayenne \n",
"27 33 RUE Polonceau 75018 Paris Paris \n",
"370 52 RUE Jean Jaurès 92230 Gennevilliers Hauts-de-Seine \n",
"\n",
" region social_object1_libelle \\\n",
"53 Île-de-France AMICALES/ GROUPEMENTS AFFINITAIRES/ GROUPEMENT... \n",
"424 Île-de-France CULTURE/ PRATIQUES D'ACTIVITÉS ARTISTIQUES/ PR... \n",
"1032 Pays de la Loire AMICALES/ GROUPEMENTS AFFINITAIRES/ GROUPEMENT... \n",
"27 Île-de-France AIDE À L'EMPLOI/ DÉVELOPPEMENT LOCAL/ PROMOTIO... \n",
"370 Île-de-France CULTURE/ PRATIQUES D'ACTIVITÉS ARTISTIQUES/ PR... \n",
"\n",
" social_object2_libelle longitude latitude \\\n",
"53 AUTRES 2.394138 48.864734 \n",
"424 AUTRES 2.611912 48.887420 \n",
"1032 AUTRES -0.479571 47.863856 \n",
"27 AUTRES 2.352095 48.885868 \n",
"370 AUTRES 2.294842 48.928173 \n",
"\n",
" facebook_url \\\n",
"53 https://www.facebook.com/347Bis \n",
"424 https://www.facebook.com/STEPHSON-LUMIERES-180... \n",
"1032 https://www.facebook.com/locksjacky/videos/ma-... \n",
"27 https://www.facebook.com/Mes-amis-du-Cameroun-... \n",
"370 https://fr-fr.facebook.com/dynamiques.femmes \n",
"\n",
" helloasso_url \n",
"53 https://www.helloasso.com/associations/stop-ho... \n",
"424 https://www.helloasso.com/associations/associa... \n",
"1032 https://www.helloasso.com/associations/la-maso... \n",
"27 https://www.helloasso.com/associations/feiac-f... \n",
"370 https://www.helloasso.com/associations/associa... "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_columns = [\n",
" \"titre\", \"objet\", \"social_object1_libelle\", \"social_object2_libelle\"\n",
"]\n",
"\n",
"data = normalize(data, text_columns)\n",
"data.sample(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save without index"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"filename = '../ref-rna-real-mars-2022-enriched-not-qualified-process'\n",
"data.to_csv(f'./{filename}.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "d9a8acb4f733d3596df9f6fac9daff15e014d11794ebc65488d1c191c94698fd"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading