diff --git a/notebooks/stage/hatesonar_analysis.ipynb b/notebooks/stage/hatesonar_analysis.ipynb new file mode 100644 index 0000000..95741a0 --- /dev/null +++ b/notebooks/stage/hatesonar_analysis.ipynb @@ -0,0 +1,1272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HateSonar Analysis\n", + "\n", + "This notebook ingests the preprocessed email text from `../interim/text` downloaded by `download_datasets.ipynb` and quantifies the levels of hate speech or offensive language in each of the emails.\n", + "\n", + "Finally, the analyses are merged and saved as a single csv file that is pushed to remote storage.\n", + "\n", + "HateSonar scores how strongly a text matches each of three categories (hate speech, offensive language, or neither) and reports the top-scoring category. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2021-04-21T17:36:25.292535Z", + "start_time": "2021-04-21T17:36:16.847951Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: hatesonar in /opt/app-root/lib/python3.6/site-packages (0.0.7)\n", + "Requirement already satisfied: scipy>=1.0.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.5.4)\n", + "Requirement already satisfied: numpy>=1.14.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.19.5)\n", + "Requirement already satisfied: pandas>=0.22.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.1.5)\n", + "Requirement already satisfied: joblib>=0.16.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.0.1)\n", + "Requirement already satisfied: scikit-learn>=0.19.1 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (0.20.3)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/app-root/lib/python3.6/site-packages (from pandas>=0.22.0->hatesonar) (2.8.1)\n", + "Requirement already
satisfied: pytz>=2017.2 in /opt/app-root/lib/python3.6/site-packages (from pandas>=0.22.0->hatesonar) (2021.1)\n", + "Requirement already satisfied: six>=1.5 in /opt/app-root/lib/python3.6/site-packages (from python-dateutil>=2.7.3->pandas>=0.22.0->hatesonar) (1.15.0)\n", + "Requirement already satisfied: scikit-learn==0.20.3 in /opt/app-root/lib/python3.6/site-packages (0.20.3)\n", + "Requirement already satisfied: scipy>=0.13.3 in /opt/app-root/lib/python3.6/site-packages (from scikit-learn==0.20.3) (1.5.4)\n", + "Requirement already satisfied: numpy>=1.8.2 in /opt/app-root/lib/python3.6/site-packages (from scikit-learn==0.20.3) (1.19.5)\n" + ] + } + ], + "source": [ + "!pip install hatesonar\n", + "!pip install scikit-learn==0.20.3" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2021-04-21T17:36:26.131919Z", + "start_time": "2021-04-21T17:36:25.295426Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "import re\n", + "import datetime\n", + "from pathlib import Path\n", + "from dotenv import load_dotenv\n", + "from hatesonar import Sonar\n", + "\n", + "load_dotenv(\"../../.env\")\n", + "\n", + "import sys\n", + "\n", + "sys.path.append(\"../..\")\n", + "from src import utils\n", + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2021-03-30T19:23:02.485154Z", + "start_time": "2021-03-30T19:23:02.478438Z" + } + }, + "outputs": [], + "source": [ + "BASE_PATH = os.getenv(\"LOCAL_DATA_PATH\", \"../../data/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2021-03-30T19:23:17.220171Z", + "start_time": "2021-03-30T19:23:03.323633Z" + } + }, + "outputs": [], + "source": [ + "df = utils.load_dataset(f\"{BASE_PATH}/interim/text/\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2021-03-30T19:23:17.239602Z", + "start_time": "2021-03-30T19:23:17.222644Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message-IDDateBody
0<23f4b2992626d689b84a704a575d974cc794709e.came...Fri, 31 Jul 2020 18:41:49 -0600['On Fri, 2020-07-31 at 19:26 +0100, Richard W...
1<CAB_b4sBOn9Bisre7D3pUrDmH9+3unoP5VaeRGi031ks3...Sat, 01 Aug 2020 11:07:52 +0800['Jerry James <loganjerry(a)gmail.com> =E4=BA=...
2<CAJP_izdx=xTviDd4piWMLvxua7Ti8wD81kwqFEB7ucbG...Sat, 01 Aug 2020 03:25:48 -0400['libcroco was retired on Rawhide, but the lib...
3<rg3f65$16fd$1@ciao.gmane.io>Sat, 01 Aug 2020 12:12:21 +0200['Hi,\\n\\nseeing the amount of fallout from LTO...
4<rg3fi2$ipa$1@ciao.gmane.io>Sat, 01 Aug 2020 12:18:41 +0200['Neal Gompa wrote:\\n> I think it does have va...
\n", + "
" + ], + "text/plain": [ + " Message-ID \\\n", + "0 <23f4b2992626d689b84a704a575d974cc794709e.came... \n", + "1 \n", + "4 \n", + "\n", + " Date \\\n", + "0 Fri, 31 Jul 2020 18:41:49 -0600 \n", + "1 Sat, 01 Aug 2020 11:07:52 +0800 \n", + "2 Sat, 01 Aug 2020 03:25:48 -0400 \n", + "3 Sat, 01 Aug 2020 12:12:21 +0200 \n", + "4 Sat, 01 Aug 2020 12:18:41 +0200 \n", + "\n", + " Body \n", + "0 ['On Fri, 2020-07-31 at 19:26 +0100, Richard W... \n", + "1 ['Jerry James =E4=BA=... \n", + "2 ['libcroco was retired on Rawhide, but the lib... \n", + "3 ['Hi,\\n\\nseeing the amount of fallout from LTO... \n", + "4 ['Neal Gompa wrote:\\n> I think it does have va... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Preprocessing\n", + "\n", + "Due to the casual nature of email writing, along with some known useless artifacts present in our textual dataset, we need to clean our data a bit before performing our analysis. 
# %%
# Line prefixes that mark quoted text, forwarded headers, PGP armour or a
# reply attribution rather than the author's own words.
_NOISE_PREFIXES = (
    ">",
    "|",
    "Re:",
    "Subject",
    "From:",
    "Date:",
    "Hash:",
    "Version: G",
    "Am Mit,",
    "Am Don,",
    "Am Mon,",
    "Quoting",
    "Em Quinta,",
)

# Fragments that disqualify a line wherever they occur in it.
_NOISE_SUBSTRINGS = (
    "BEGIN PGP SIGNED MESSAGE",
    "wrote:",
    "wrote :",
    "writes:",
    "said:",
)

# Attribution endings, including mangled variants seen in the data.
_NOISE_SUFFIXES = ("said: ", "babbled:", "wrot=e:", "A9crit :")

# "On <weekday>, <date> ..." reply attributions in their various date layouts.
# Compiled once here instead of on every call.
_ATTRIBUTION_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        ".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), .. (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) 20..*",
        (
            ".*n (Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday) .."
            " (January|February|March|April|May|June|July|August|September|October|November|December) 20..*"
        ),
        ".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) .., 20..*",
        r".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), 20[\d]{2}-[\d]{2}-[\d]{2} at.*",
    )
)

# Ordered (regex, replacement) pairs applied to every body. Order matters:
# e.g. ">" and "<" are removed before "= ", and "-" before the URL pattern.
_BODY_SUBSTITUTIONS = (
    (r"\n", " "),  # real newlines become spaces
    ("'", ""),
    (">", ""),
    ("<", ""),
    ("= ", ""),
    ("-", ""),
    (r"http\S+", ""),  # links
    (r"\\\s+", ""),  # a backslash followed by whitespace
    (r"\\s+", ""),  # a backslash followed by one or more "s"
    ("=", ""),
    ("_", ""),
    ("3D", ""),  # presumably left over from quoted-printable "=3D" -- TODO confirm
    (r" {2,}", " "),  # collapse every run of spaces to a single space
)


def _is_thread_noise(line):
    """Return True when ``line`` matches any quote/header/attribution marker."""
    return (
        line.startswith(_NOISE_PREFIXES)
        or line.endswith(_NOISE_SUFFIXES)
        or any(fragment in line for fragment in _NOISE_SUBSTRINGS)
        or any(pattern.match(line) for pattern in _ATTRIBUTION_PATTERNS)
    )


def strip_thread(text):
    """Remove quoted replies, header lines and reply attributions from an email.

    Carriage returns are discarded, the text is split on newlines, and every
    empty line or line recognised by ``_is_thread_noise`` is dropped.

    Parameters
    ----------
    text : str
        Raw email body. Any non-string value (e.g. ``None`` or NaN from a
        missing body) is treated as an empty message.

    Returns
    -------
    str
        The surviving lines joined with ``"\\n"``; ``""`` if nothing survives.
    """
    if not isinstance(text, str):
        return ""
    lines = text.replace("\r", "").split("\n")
    return "\n".join(
        line for line in lines if line and not _is_thread_noise(line)
    )


# format for CSV, clean special characters, and remove extraneous emails
def pandas_clean(emails):
    """Return a cleaned copy of ``emails`` with a normalised ``Body`` column.

    Rows whose body is missing are discarded first, so the string operations
    never see NaN. The substitutions in ``_BODY_SUBSTITUTIONS`` are then
    applied in order, the text is stripped, literal backslash-n sequences are
    removed, and rows whose body ends up as ``""`` or ``" "`` are dropped.

    Rows are selected positionally, so a frame with repeated index labels
    loses only the rows that are actually empty. The input frame is left
    untouched; use the return value.

    Parameters
    ----------
    emails : pandas.DataFrame
        Must contain a ``Body`` column.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows with a fresh ``0..n-1`` index.
    """
    has_body = emails["Body"].notna().to_numpy()
    cleaned = emails[has_body].reset_index(drop=True)

    body = cleaned["Body"].astype(str)
    for pattern, replacement in _BODY_SUBSTITUTIONS:
        body = body.str.replace(pattern, replacement, regex=True)
    # r"\n" with regex=False is the two-character sequence backslash + "n"
    # that survives in stringified bodies, not a real newline.
    body = body.str.strip().str.replace(r"\n", "", regex=False)
    cleaned["Body"] = body

    is_blank = body.isin(["", " "]).to_numpy()
    return cleaned[~is_blank].reset_index(drop=True)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message-IDDateBody
0<CAB_b4sBOn9Bisre7D3pUrDmH9+3unoP5VaeRGi031ks3...Sat, 01 Aug 2020 11:07:52 +0800[Jerry James loganjerry(a)gmail.com E4BA8E 202...
1<CAJP_izdx=xTviDd4piWMLvxua7Ti8wD81kwqFEB7ucbG...Sat, 01 Aug 2020 03:25:48 -0400[libcroco was retired on Rawhide, but the libc...
2<rg3f65$16fd$1@ciao.gmane.io>Sat, 01 Aug 2020 12:12:21 +0200[Hi,seeing the amount of fallout from LTO, I r...
3<20200801121236.4381.17318@mailman01.iad2.fedo...Sat, 01 Aug 2020 12:12:36 +0000[Well, that second mass rebuild made things wo...
4<D15334F0-3457-42A9-8E18-601002F1302D@barrys-e...Sat, 01 Aug 2020 13:24:13 +0100[\"I see that this ticket is still NEW.Ive upda...
............
14697<20190329164043.GA10522@branched-composer.phx2...Fri, 29 Mar 2019 16:40:43 +0000[OLD: Fedora3020190326.n.0NEW: Fedora302019032...
14698<20190329173043.DA4F76079248@bastion01.phx2.fe...Fri, 29 Mar 2019 17:30:43 +0000[Missing expected images:Atomichost rawxz x866...
14699<654338f6-25fe-37fd-9101-c095e9200545@doubledo...Fri, 29 Mar 2019 14:47:35 -0400[\"I know its not unusual to carry builds over ...
14700<cd084ec7-bda8-57c0-c1f2-ea7f2c48f335@redhat.com>Fri, 29 Mar 2019 19:58:33 +0100[\"Dne 29. 03. 19 v 19:47 John Florian napsal(a...
14701<CAB-QmhR1UHz1_KUh0P_H=+ZpFxkZCJo4mVx4H7FeD9G8...Fri, 29 Mar 2019 22:25:23 +0100[Hi everybody,It looks like the first round of...
\n", + "

14702 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Message-ID \\\n", + "0 \n", + "3 <20200801121236.4381.17318@mailman01.iad2.fedo... \n", + "4 \n", + "14701 \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message-IDDateBodyChunk
0<20180101220004.0632660A400B@fedocal02.phx2.fe...2018-01-01 22:00:04+00:00[Dear all,You are kindly invited to the meetin...2018-01-01
1<20180101220004.0E97560A400C@fedocal02.phx2.fe...2018-01-01 22:00:04+00:00[Dear all,You are kindly invited to the meetin...2018-01-01
2<20180101221314.GA52721@rawhide-composer.phx2....2018-01-01 22:13:15+00:00[OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh...2018-01-01
3<20180101233509.D734E60478E3@bastion01.phx2.fe...2018-01-01 23:35:09+00:00[Missing expected images:Server dvd i386Workst...2018-01-01
4<66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com>2018-01-02 10:26:51+01:00[\"Could you please drop the dependency on GCC ...2018-01-01
# %%
# Parse each Date string into a timestamp, then tag every message with the
# first day of the month it was sent in ("Chunk") so messages can be grouped
# by month later on.
clean["Date"] = clean["Date"].apply(pd.to_datetime)
clean["Chunk"] = clean["Date"].apply(
    lambda stamp: datetime.date(stamp.year, stamp.month, 1)
)

# Order the messages chronologically and renumber the rows from zero.
clean = clean.sort_values(by="Date").reset_index(drop=True)
clean.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message-IDDateBodyChunk
14697<20210227161758.B43EC304C540@bastion01.iad2.fe...2021-02-27 16:17:58+00:00[No missing expected images.Compose FAILS prop...2021-02-01
14698<20210227183412.4CCC7307262F@bastion01.iad2.fe...2021-02-27 18:34:12+00:00[No missing expected images.Failed openQA test...2021-02-01
14699<346ef226-3317-c310-d80c-283e4cc7dc2d@redhat.com>2021-02-27 20:30:45+01:00[Hi Benjamin, Ray,I noticed this problem while...2021-02-01
14700<8dee2ff2-e118-bdb2-5d77-20ca82759727@gmail.com>2021-02-27 20:59:59+01:00[Hi,I am trying to test some Renoir s2idle pat...2021-02-01
14701<4199adc3-49c8-4d3d-d768-84327df177fa@gmail.com>2021-02-27 18:56:52-05:00[The assimp license field for version 5.0.1 ha...2021-02-01
\n", + "
# %%
clean.tail()

# %% [markdown]
# ## HateSonar analysis on whole dataset

# %%
# Build the classifier once and reuse it for every message.
sonar = Sonar()

# %%
# Output columns, in the same order as the values returned by `speech`.
SONAR_COLUMNS = ("Top", "Hate Speech", "Offensive Language", "Neither")


def speech(n):
    """Classify one text with HateSonar.

    Parameters
    ----------
    n : str
        The text to classify.

    Returns
    -------
    list
        ``[top_class, c0, c1, c2]`` where ``top_class`` is the label HateSonar
        ranks highest and ``c0..c2`` are the confidences of the first three
        entries of the result's ``"classes"`` list.
    """
    result = sonar.ping(text=n)
    # NOTE(review): the columns built from this assume the entries are ordered
    # hate speech, offensive language, neither -- confirm against HateSonar.
    confidences = [result["classes"][i]["confidence"] for i in range(3)]
    return [result["top_class"]] + confidences


def get_val(val, index=None):
    """Return one element of a ``speech`` result.

    Parameters
    ----------
    val : sequence
        A value produced by ``speech``.
    index : int, optional
        Position to extract. When omitted, the module-level ``loc`` is used,
        which keeps existing ``get_val(val)`` calls working.

    Returns
    -------
    object
        ``val[index]``.
    """
    return val[loc if index is None else index]


# %%
clean["sonar"] = clean["Body"].apply(speech)
# Expand the per-message result list into one column per value, passing the
# position explicitly instead of reassigning a global between calls.
for position, column in enumerate(SONAR_COLUMNS):
    clean[column] = clean["sonar"].apply(get_val, args=(position,))
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message-IDDateBodyChunksonarTopHate SpeechOffensive LanguageNeither
0<20180101220004.0632660A400B@fedocal02.phx2.fe...2018-01-01 22:00:04+00:00[Dear all,You are kindly invited to the meetin...2018-01-01[neither, 0.07996127979422231, 0.3331293663946...neither0.0799610.3331290.586909
1<20180101220004.0E97560A400C@fedocal02.phx2.fe...2018-01-01 22:00:04+00:00[Dear all,You are kindly invited to the meetin...2018-01-01[neither, 0.08164312982342418, 0.3330956077948...neither0.0816430.3330960.585261
2<20180101221314.GA52721@rawhide-composer.phx2....2018-01-01 22:13:15+00:00[OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh...2018-01-01[neither, 0.03325657099633886, 0.3733650971099...neither0.0332570.3733650.593378
3<20180101233509.D734E60478E3@bastion01.phx2.fe...2018-01-01 23:35:09+00:00[Missing expected images:Server dvd i386Workst...2018-01-01[neither, 0.039981707371010575, 0.326850382054...neither0.0399820.3268500.633168
4<66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com>2018-01-02 10:26:51+01:00[\"Could you please drop the dependency on GCC ...2018-01-01[neither, 0.04388574198143961, 0.4128345886699...neither0.0438860.4128350.543280
\n", + "
" + ], + "text/plain": [ + " Message-ID \\\n", + "0 <20180101220004.0632660A400B@fedocal02.phx2.fe... \n", + "1 <20180101220004.0E97560A400C@fedocal02.phx2.fe... \n", + "2 <20180101221314.GA52721@rawhide-composer.phx2.... \n", + "3 <20180101233509.D734E60478E3@bastion01.phx2.fe... \n", + "4 <66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com> \n", + "\n", + " Date \\\n", + "0 2018-01-01 22:00:04+00:00 \n", + "1 2018-01-01 22:00:04+00:00 \n", + "2 2018-01-01 22:13:15+00:00 \n", + "3 2018-01-01 23:35:09+00:00 \n", + "4 2018-01-02 10:26:51+01:00 \n", + "\n", + " Body Chunk \\\n", + "0 [Dear all,You are kindly invited to the meetin... 2018-01-01 \n", + "1 [Dear all,You are kindly invited to the meetin... 2018-01-01 \n", + "2 [OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh... 2018-01-01 \n", + "3 [Missing expected images:Server dvd i386Workst... 2018-01-01 \n", + "4 [\"Could you please drop the dependency on GCC ... 2018-01-01 \n", + "\n", + " sonar Top Hate Speech \\\n", + "0 [neither, 0.07996127979422231, 0.3331293663946... neither 0.079961 \n", + "1 [neither, 0.08164312982342418, 0.3330956077948... neither 0.081643 \n", + "2 [neither, 0.03325657099633886, 0.3733650971099... neither 0.033257 \n", + "3 [neither, 0.039981707371010575, 0.326850382054... neither 0.039982 \n", + "4 [neither, 0.04388574198143961, 0.4128345886699... 
neither 0.043886 \n", + "\n", + " Offensive Language Neither \n", + "0 0.333129 0.586909 \n", + "1 0.333096 0.585261 \n", + "2 0.373365 0.593378 \n", + "3 0.326850 0.633168 \n", + "4 0.412835 0.543280 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2021-03-16T15:18:43.038824Z", + "start_time": "2021-03-16T15:18:43.031793Z" + } + }, + "source": [ + "### Offensive Language classification\n", + "\n", + "From a high-level analysis, it seems that several of the flagged messages either had a lot of excess text (most likely from links) or used more direct language when explaining issues. " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2021-03-30T19:24:28.967131Z", + "start_time": "2021-03-30T19:24:28.684843Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Message-IDDateBodyChunksonarTopHate SpeechOffensive LanguageNeither
316<6f26913b-d3cd-7ef2-000e-9f5931db179b@redhat.com>2018-01-25 09:26:52+01:00[Just to illustrate what this is about, these ...2018-01-01[offensive_language, 0.055735411386999514, 0.4...offensive_language0.0557350.4741440.470121
421<ufabmh7arll.fsf@epithumnia.math.uh.edu>2018-02-02 17:39:50-06:00[\"Actually comprehending your message, I see i...2018-02-01[offensive_language, 0.05987457809697904, 0.47...offensive_language0.0598750.4766030.463522
517<CALC7GWx5vt10tK9m4PajtnEZN6kqNDE+4m==MTJq_8Dr...2018-02-13 02:00:46+01:00[I don\\t think, removing the changelog entirel...2018-02-01[offensive_language, 0.0594192768366574, 0.484...offensive_language0.0594190.4844440.456137
611<20180218173857.12956.59900@mailman01.phx2.fed...2018-02-18 17:38:57+00:00[ If you fixed package(s), Just to make sure: ...2018-02-01[offensive_language, 0.03715599088459851, 0.49...offensive_language0.0371560.4913010.471543
616<CABB28CxRa5NdyPp76wA88FRQm1rc8=A5TQgonhu1f+oQ...2018-02-18 20:50:06+00:00[\"On 18 February 2018 at 18:06, Stephen John S...2018-02-01[offensive_language, 0.056053959942993656, 0.4...offensive_language0.0560540.4782860.465661
\n", + "
" + ], + "text/plain": [ + " Message-ID \\\n", + "316 <6f26913b-d3cd-7ef2-000e-9f5931db179b@redhat.com> \n", + "421 \n", + "517