diff --git a/notebooks/stage/hatesonar_analysis.ipynb b/notebooks/stage/hatesonar_analysis.ipynb
new file mode 100644
index 0000000..95741a0
--- /dev/null
+++ b/notebooks/stage/hatesonar_analysis.ipynb
@@ -0,0 +1,1272 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# HateSonar Analysis\n",
+ "\n",
+ "This notebook ingests the preprocessed email text from the `interim/text` directory of the local data path, downloaded by `download_datasets.ipynb`, and quantifies the level of hate speech or offensive language in each of the emails.\n",
+ "\n",
+ "Finally, the analyses are merged and saved as a single csv file that is pushed to remote storage.\n",
+ "\n",
+ "HateSonar scores how strongly a text matches each of three categories (hate speech, offensive language, or neither) and also reports the top-scoring category."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-04-21T17:36:25.292535Z",
+ "start_time": "2021-04-21T17:36:16.847951Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: hatesonar in /opt/app-root/lib/python3.6/site-packages (0.0.7)\n",
+ "Requirement already satisfied: scipy>=1.0.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.5.4)\n",
+ "Requirement already satisfied: numpy>=1.14.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.19.5)\n",
+ "Requirement already satisfied: pandas>=0.22.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.1.5)\n",
+ "Requirement already satisfied: joblib>=0.16.0 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (1.0.1)\n",
+ "Requirement already satisfied: scikit-learn>=0.19.1 in /opt/app-root/lib/python3.6/site-packages (from hatesonar) (0.20.3)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /opt/app-root/lib/python3.6/site-packages (from pandas>=0.22.0->hatesonar) (2.8.1)\n",
+ "Requirement already satisfied: pytz>=2017.2 in /opt/app-root/lib/python3.6/site-packages (from pandas>=0.22.0->hatesonar) (2021.1)\n",
+ "Requirement already satisfied: six>=1.5 in /opt/app-root/lib/python3.6/site-packages (from python-dateutil>=2.7.3->pandas>=0.22.0->hatesonar) (1.15.0)\n",
+ "Requirement already satisfied: scikit-learn==0.20.3 in /opt/app-root/lib/python3.6/site-packages (0.20.3)\n",
+ "Requirement already satisfied: scipy>=0.13.3 in /opt/app-root/lib/python3.6/site-packages (from scikit-learn==0.20.3) (1.5.4)\n",
+ "Requirement already satisfied: numpy>=1.8.2 in /opt/app-root/lib/python3.6/site-packages (from scikit-learn==0.20.3) (1.19.5)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Install into the running kernel's environment (%pip rather than !pip) and pin\n",
+ "# both versions so a fresh Restart & Run All rebuilds the same environment.\n",
+ "# NOTE(review): scikit-learn is presumably pinned for hatesonar's bundled model -- confirm.\n",
+ "%pip install hatesonar==0.0.7 scikit-learn==0.20.3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-04-21T17:36:26.131919Z",
+ "start_time": "2021-04-21T17:36:25.295426Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import os\n",
+ "import re\n",
+ "import datetime\n",
+ "from pathlib import Path\n",
+ "from dotenv import load_dotenv\n",
+ "from hatesonar import Sonar\n",
+ "\n",
+ "load_dotenv(\"../../.env\")\n",
+ "\n",
+ "import sys\n",
+ "\n",
+ "sys.path.append(\"../..\")\n",
+ "from src import utils\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:23:02.485154Z",
+ "start_time": "2021-03-30T19:23:02.478438Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Root directory for data files: taken from the LOCAL_DATA_PATH environment variable\n",
+ "# (load_dotenv above may populate it from ../../.env), falling back to ../../data/.\n",
+ "BASE_PATH = os.getenv(\"LOCAL_DATA_PATH\", \"../../data/\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:23:17.220171Z",
+ "start_time": "2021-03-30T19:23:03.323633Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Load the emails through the project helper; the frame displayed in the next\n",
+ "# cell has Message-ID, Date and Body columns.\n",
+ "df = utils.load_dataset(f\"{BASE_PATH}/interim/text/\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:23:17.239602Z",
+ "start_time": "2021-03-30T19:23:17.222644Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Message-ID | \n",
+ " Date | \n",
+ " Body | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " <23f4b2992626d689b84a704a575d974cc794709e.came... | \n",
+ " Fri, 31 Jul 2020 18:41:49 -0600 | \n",
+ " ['On Fri, 2020-07-31 at 19:26 +0100, Richard W... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " <CAB_b4sBOn9Bisre7D3pUrDmH9+3unoP5VaeRGi031ks3... | \n",
+ " Sat, 01 Aug 2020 11:07:52 +0800 | \n",
+ " ['Jerry James <loganjerry(a)gmail.com> =E4=BA=... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " <CAJP_izdx=xTviDd4piWMLvxua7Ti8wD81kwqFEB7ucbG... | \n",
+ " Sat, 01 Aug 2020 03:25:48 -0400 | \n",
+ " ['libcroco was retired on Rawhide, but the lib... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " <rg3f65$16fd$1@ciao.gmane.io> | \n",
+ " Sat, 01 Aug 2020 12:12:21 +0200 | \n",
+ " ['Hi,\\n\\nseeing the amount of fallout from LTO... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " <rg3fi2$ipa$1@ciao.gmane.io> | \n",
+ " Sat, 01 Aug 2020 12:18:41 +0200 | \n",
+ " ['Neal Gompa wrote:\\n> I think it does have va... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Message-ID \\\n",
+ "0 <23f4b2992626d689b84a704a575d974cc794709e.came... \n",
+ "1 \n",
+ "4 \n",
+ "\n",
+ " Date \\\n",
+ "0 Fri, 31 Jul 2020 18:41:49 -0600 \n",
+ "1 Sat, 01 Aug 2020 11:07:52 +0800 \n",
+ "2 Sat, 01 Aug 2020 03:25:48 -0400 \n",
+ "3 Sat, 01 Aug 2020 12:12:21 +0200 \n",
+ "4 Sat, 01 Aug 2020 12:18:41 +0200 \n",
+ "\n",
+ " Body \n",
+ "0 ['On Fri, 2020-07-31 at 19:26 +0100, Richard W... \n",
+ "1 ['Jerry James =E4=BA=... \n",
+ "2 ['libcroco was retired on Rawhide, but the lib... \n",
+ "3 ['Hi,\\n\\nseeing the amount of fallout from LTO... \n",
+ "4 ['Neal Gompa wrote:\\n> I think it does have va... "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Preprocessing\n",
+ "\n",
+ "Due to the casual nature of email writing, along with some known useless artifacts present in our textual dataset, we need to clean our data a bit before performing our analysis. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:23:17.270091Z",
+ "start_time": "2021-03-30T19:23:17.241664Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def strip_thread(text):\n",
+ "    \"\"\"Remove quoted replies, header fields and attribution lines from an email body.\n",
+ "\n",
+ "    Carriage returns are removed, the text is split on newlines, and a line is\n",
+ "    kept only if it is non-empty and matches none of the prefix, substring,\n",
+ "    suffix or regex rules below. Kept lines are re-joined with newlines.\n",
+ "    \"\"\"\n",
+ "    # A line that starts with any of these is dropped.\n",
+ "    drop_prefixes = (\n",
+ "        \">\",\n",
+ "        \"|\",\n",
+ "        \"Re:\",\n",
+ "        \"Subject\",\n",
+ "        \"From:\",\n",
+ "        \"Date:\",\n",
+ "        \"Hash:\",\n",
+ "        \"Version: G\",\n",
+ "        \"Am Mit,\",\n",
+ "        \"Am Don,\",\n",
+ "        \"Am Mon,\",\n",
+ "        \"Quoting\",\n",
+ "        \"Em Quinta,\",\n",
+ "    )\n",
+ "    # A line that contains any of these anywhere is dropped.\n",
+ "    drop_fragments = (\n",
+ "        \"BEGIN PGP SIGNED MESSAGE\",\n",
+ "        \"wrote:\",\n",
+ "        \"wrote :\",\n",
+ "        \"writes:\",\n",
+ "        \"said:\",\n",
+ "    )\n",
+ "    # A line that ends with any of these is dropped.\n",
+ "    drop_suffixes = (\"said: \", \"babbled:\", \"wrot=e:\", \"A9crit :\")\n",
+ "    # A line that matches any of these (anchored at its start) is dropped.\n",
+ "    drop_patterns = (\n",
+ "        \".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), .. (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) 20..*\",\n",
+ "        (\n",
+ "            \".*n (Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday) ..\"\n",
+ "            \" (January|February|March|April|May|June|July|August|September|October|November|December) 20..*\"\n",
+ "        ),\n",
+ "        \".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) .., 20..*\",\n",
+ "        r\".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), 20[\\d]{2}-[\\d]{2}-[\\d]{2} at.*\",\n",
+ "    )\n",
+ "\n",
+ "    def is_kept(line):\n",
+ "        # Empty lines go first so the remaining checks only see real text.\n",
+ "        if not line:\n",
+ "            return False\n",
+ "        if line.startswith(drop_prefixes) or line.endswith(drop_suffixes):\n",
+ "            return False\n",
+ "        if any(fragment in line for fragment in drop_fragments):\n",
+ "            return False\n",
+ "        return all(re.match(pattern, line) is None for pattern in drop_patterns)\n",
+ "\n",
+ "    candidate_lines = text.replace(\"\\r\", \"\").split(\"\\n\")\n",
+ "    return \"\\n\".join(line for line in candidate_lines if is_kept(line))\n",
+ "\n",
+ "\n",
+ "# format for CSV, clean special characters, and remove extraneous emails\n",
+ "def pandas_clean(emails):\n",
+ "    \"\"\"Clean the ``Body`` column and drop emails that are left without text.\n",
+ "\n",
+ "    The regex substitution groups run in the order listed below. The input\n",
+ "    frame is modified (its ``Body`` column is overwritten and unusable rows are\n",
+ "    dropped); the returned frame is a copy of it with a fresh 0..n-1 index.\n",
+ "    Rows whose body is missing, empty or a single space are dropped.\n",
+ "    \"\"\"\n",
+ "    # Each entry is (patterns, replacement) for one Series.replace(regex=True) pass.\n",
+ "    substitutions = (\n",
+ "        ([r\"\\n\", \"\\n\"], \" \"),\n",
+ "        ([r\"\\'\", \"'\", \">\", \"<\", \"= \", \"-\", r\"http\\S+\"], \"\"),\n",
+ "        ([r\"\\\\\\s+\", r\"\\\\s+\", \"=\"], \"\"),\n",
+ "        ([\"_\", \"3D\"], \"\"),\n",
+ "        # Collapse runs of spaces last, once the removals above can no longer\n",
+ "        # create new runs; the quantifier handles runs of any length.\n",
+ "        ([r\" {2,}\"], \" \"),\n",
+ "    )\n",
+ "\n",
+ "    # Work on the Series and assign it back: replace(inplace=True) called on\n",
+ "    # emails[\"Body\"] is a chained in-place update, which does not reach the frame\n",
+ "    # when pandas Copy-on-Write is active.\n",
+ "    body = emails[\"Body\"]\n",
+ "    for patterns, replacement in substitutions:\n",
+ "        body = body.replace(to_replace=patterns, value=replacement, regex=True)\n",
+ "\n",
+ "    # The .str methods pass missing values through instead of raising; the second\n",
+ "    # call removes the literal two-character sequence backslash + n.\n",
+ "    body = body.str.strip().str.replace(r\"\\n\", \"\", regex=False)\n",
+ "    emails[\"Body\"] = body\n",
+ "\n",
+ "    unusable = emails[\"Body\"].isna() | emails[\"Body\"].isin([\"\", \" \"])\n",
+ "    emails.drop(emails.index[unusable], inplace=True)\n",
+ "\n",
+ "    # drop=True discards the old index directly instead of adding a column.\n",
+ "    return emails.reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:23:29.264470Z",
+ "start_time": "2021-03-30T19:23:17.271945Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Message-ID | \n",
+ " Date | \n",
+ " Body | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " <CAB_b4sBOn9Bisre7D3pUrDmH9+3unoP5VaeRGi031ks3... | \n",
+ " Sat, 01 Aug 2020 11:07:52 +0800 | \n",
+ " [Jerry James loganjerry(a)gmail.com E4BA8E 202... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " <CAJP_izdx=xTviDd4piWMLvxua7Ti8wD81kwqFEB7ucbG... | \n",
+ " Sat, 01 Aug 2020 03:25:48 -0400 | \n",
+ " [libcroco was retired on Rawhide, but the libc... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " <rg3f65$16fd$1@ciao.gmane.io> | \n",
+ " Sat, 01 Aug 2020 12:12:21 +0200 | \n",
+ " [Hi,seeing the amount of fallout from LTO, I r... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " <20200801121236.4381.17318@mailman01.iad2.fedo... | \n",
+ " Sat, 01 Aug 2020 12:12:36 +0000 | \n",
+ " [Well, that second mass rebuild made things wo... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " <D15334F0-3457-42A9-8E18-601002F1302D@barrys-e... | \n",
+ " Sat, 01 Aug 2020 13:24:13 +0100 | \n",
+ " [\"I see that this ticket is still NEW.Ive upda... | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 14697 | \n",
+ " <20190329164043.GA10522@branched-composer.phx2... | \n",
+ " Fri, 29 Mar 2019 16:40:43 +0000 | \n",
+ " [OLD: Fedora3020190326.n.0NEW: Fedora302019032... | \n",
+ "
\n",
+ " \n",
+ " 14698 | \n",
+ " <20190329173043.DA4F76079248@bastion01.phx2.fe... | \n",
+ " Fri, 29 Mar 2019 17:30:43 +0000 | \n",
+ " [Missing expected images:Atomichost rawxz x866... | \n",
+ "
\n",
+ " \n",
+ " 14699 | \n",
+ " <654338f6-25fe-37fd-9101-c095e9200545@doubledo... | \n",
+ " Fri, 29 Mar 2019 14:47:35 -0400 | \n",
+ " [\"I know its not unusual to carry builds over ... | \n",
+ "
\n",
+ " \n",
+ " 14700 | \n",
+ " <cd084ec7-bda8-57c0-c1f2-ea7f2c48f335@redhat.com> | \n",
+ " Fri, 29 Mar 2019 19:58:33 +0100 | \n",
+ " [\"Dne 29. 03. 19 v 19:47 John Florian napsal(a... | \n",
+ "
\n",
+ " \n",
+ " 14701 | \n",
+ " <CAB-QmhR1UHz1_KUh0P_H=+ZpFxkZCJo4mVx4H7FeD9G8... | \n",
+ " Fri, 29 Mar 2019 22:25:23 +0100 | \n",
+ " [Hi everybody,It looks like the first round of... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
14702 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Message-ID \\\n",
+ "0 \n",
+ "3 <20200801121236.4381.17318@mailman01.iad2.fedo... \n",
+ "4 \n",
+ "14701 \n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Message-ID | \n",
+ " Date | \n",
+ " Body | \n",
+ " Chunk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " <20180101220004.0632660A400B@fedocal02.phx2.fe... | \n",
+ " 2018-01-01 22:00:04+00:00 | \n",
+ " [Dear all,You are kindly invited to the meetin... | \n",
+ " 2018-01-01 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " <20180101220004.0E97560A400C@fedocal02.phx2.fe... | \n",
+ " 2018-01-01 22:00:04+00:00 | \n",
+ " [Dear all,You are kindly invited to the meetin... | \n",
+ " 2018-01-01 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " <20180101221314.GA52721@rawhide-composer.phx2.... | \n",
+ " 2018-01-01 22:13:15+00:00 | \n",
+ " [OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh... | \n",
+ " 2018-01-01 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " <20180101233509.D734E60478E3@bastion01.phx2.fe... | \n",
+ " 2018-01-01 23:35:09+00:00 | \n",
+ " [Missing expected images:Server dvd i386Workst... | \n",
+ " 2018-01-01 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " <66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com> | \n",
+ " 2018-01-02 10:26:51+01:00 | \n",
+ " [\"Could you please drop the dependency on GCC ... | \n",
+ " 2018-01-01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ " Message-ID \\\n",
+ "0 <20180101220004.0632660A400B@fedocal02.phx2.fe... \n",
+ "1 <20180101220004.0E97560A400C@fedocal02.phx2.fe... \n",
+ "2 <20180101221314.GA52721@rawhide-composer.phx2.... \n",
+ "3 <20180101233509.D734E60478E3@bastion01.phx2.fe... \n",
+ "4 <66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com> \n",
+ "\n",
+ " Date \\\n",
+ "0 2018-01-01 22:00:04+00:00 \n",
+ "1 2018-01-01 22:00:04+00:00 \n",
+ "2 2018-01-01 22:13:15+00:00 \n",
+ "3 2018-01-01 23:35:09+00:00 \n",
+ "4 2018-01-02 10:26:51+01:00 \n",
+ "\n",
+ " Body Chunk \n",
+ "0 [Dear all,You are kindly invited to the meetin... 2018-01-01 \n",
+ "1 [Dear all,You are kindly invited to the meetin... 2018-01-01 \n",
+ "2 [OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh... 2018-01-01 \n",
+ "3 [Missing expected images:Server dvd i386Workst... 2018-01-01 \n",
+ "4 [\"Could you please drop the dependency on GCC ... 2018-01-01 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Parse every Date value, then tag each row with the first day of its calendar\n",
+ "# month (\"Chunk\") and order the frame chronologically with a fresh index.\n",
+ "clean[\"Date\"] = clean[\"Date\"].apply(pd.to_datetime)\n",
+ "clean[\"Chunk\"] = [\n",
+ "    datetime.date(stamp.year, stamp.month, 1) for stamp in clean[\"Date\"]\n",
+ "]\n",
+ "clean = clean.sort_values(by=\"Date\").reset_index(drop=True)\n",
+ "clean.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:23:35.168018Z",
+ "start_time": "2021-03-30T19:23:35.158141Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Message-ID | \n",
+ " Date | \n",
+ " Body | \n",
+ " Chunk | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 14697 | \n",
+ " <20210227161758.B43EC304C540@bastion01.iad2.fe... | \n",
+ " 2021-02-27 16:17:58+00:00 | \n",
+ " [No missing expected images.Compose FAILS prop... | \n",
+ " 2021-02-01 | \n",
+ "
\n",
+ " \n",
+ " 14698 | \n",
+ " <20210227183412.4CCC7307262F@bastion01.iad2.fe... | \n",
+ " 2021-02-27 18:34:12+00:00 | \n",
+ " [No missing expected images.Failed openQA test... | \n",
+ " 2021-02-01 | \n",
+ "
\n",
+ " \n",
+ " 14699 | \n",
+ " <346ef226-3317-c310-d80c-283e4cc7dc2d@redhat.com> | \n",
+ " 2021-02-27 20:30:45+01:00 | \n",
+ " [Hi Benjamin, Ray,I noticed this problem while... | \n",
+ " 2021-02-01 | \n",
+ "
\n",
+ " \n",
+ " 14700 | \n",
+ " <8dee2ff2-e118-bdb2-5d77-20ca82759727@gmail.com> | \n",
+ " 2021-02-27 20:59:59+01:00 | \n",
+ " [Hi,I am trying to test some Renoir s2idle pat... | \n",
+ " 2021-02-01 | \n",
+ "
\n",
+ " \n",
+ " 14701 | \n",
+ " <4199adc3-49c8-4d3d-d768-84327df177fa@gmail.com> | \n",
+ " 2021-02-27 18:56:52-05:00 | \n",
+ " [The assimp license field for version 5.0.1 ha... | \n",
+ " 2021-02-01 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Message-ID \\\n",
+ "14697 <20210227161758.B43EC304C540@bastion01.iad2.fe... \n",
+ "14698 <20210227183412.4CCC7307262F@bastion01.iad2.fe... \n",
+ "14699 <346ef226-3317-c310-d80c-283e4cc7dc2d@redhat.com> \n",
+ "14700 <8dee2ff2-e118-bdb2-5d77-20ca82759727@gmail.com> \n",
+ "14701 <4199adc3-49c8-4d3d-d768-84327df177fa@gmail.com> \n",
+ "\n",
+ " Date \\\n",
+ "14697 2021-02-27 16:17:58+00:00 \n",
+ "14698 2021-02-27 18:34:12+00:00 \n",
+ "14699 2021-02-27 20:30:45+01:00 \n",
+ "14700 2021-02-27 20:59:59+01:00 \n",
+ "14701 2021-02-27 18:56:52-05:00 \n",
+ "\n",
+ " Body Chunk \n",
+ "14697 [No missing expected images.Compose FAILS prop... 2021-02-01 \n",
+ "14698 [No missing expected images.Failed openQA test... 2021-02-01 \n",
+ "14699 [Hi Benjamin, Ray,I noticed this problem while... 2021-02-01 \n",
+ "14700 [Hi,I am trying to test some Renoir s2idle pat... 2021-02-01 \n",
+ "14701 [The assimp license field for version 5.0.1 ha... 2021-02-01 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clean.tail()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-16T15:31:44.854856Z",
+ "start_time": "2021-03-16T15:31:44.848830Z"
+ }
+ },
+ "source": [
+ "## HateSonar analysis on whole dataset\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-04-21T17:36:42.144123Z",
+ "start_time": "2021-04-21T17:36:40.554681Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Build the classifier once; speech() below reads this module-level instance.\n",
+ "sonar = Sonar()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:24:02.248990Z",
+ "start_time": "2021-03-30T19:24:02.235702Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def speech(n):\n",
+ "    \"\"\"Classify one text with the module-level ``sonar`` instance.\n",
+ "\n",
+ "    Returns a four-item list: the top class label followed by the confidence\n",
+ "    values of the first three entries of the result's ``classes`` list, in the\n",
+ "    order the classifier reports them.\n",
+ "    \"\"\"\n",
+ "    result = sonar.ping(text=n)\n",
+ "    label = result[\"top_class\"]\n",
+ "    confidences = [result[\"classes\"][slot][\"confidence\"] for slot in range(3)]\n",
+ "    return [label] + confidences\n",
+ "\n",
+ "\n",
+ "def get_val(val, idx=None):\n",
+ "    \"\"\"Return one element of ``val``.\n",
+ "\n",
+ "    ``idx`` selects the position. When it is omitted the function falls back to\n",
+ "    the module-level ``loc`` variable, which keeps existing ``apply(get_val)``\n",
+ "    calls working; passing ``idx`` explicitly avoids depending on that global.\n",
+ "    \"\"\"\n",
+ "    return val[loc if idx is None else idx]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:24:28.655553Z",
+ "start_time": "2021-03-30T19:24:02.963496Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "clean[\"sonar\"] = clean[\"Body\"].apply(speech)\n",
+ "\n",
+ "# speech() returns [top, hate, offensive, neither]; spread the positions into\n",
+ "# named columns. The position is bound per column rather than passed through a\n",
+ "# global that has to be reassigned between statements.\n",
+ "for position, column in enumerate(\n",
+ "    [\"Top\", \"Hate Speech\", \"Offensive Language\", \"Neither\"]\n",
+ "):\n",
+ "    clean[column] = clean[\"sonar\"].apply(\n",
+ "        lambda scores, position=position: scores[position]\n",
+ "    )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:24:28.682796Z",
+ "start_time": "2021-03-30T19:24:28.657948Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Message-ID | \n",
+ " Date | \n",
+ " Body | \n",
+ " Chunk | \n",
+ " sonar | \n",
+ " Top | \n",
+ " Hate Speech | \n",
+ " Offensive Language | \n",
+ " Neither | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " <20180101220004.0632660A400B@fedocal02.phx2.fe... | \n",
+ " 2018-01-01 22:00:04+00:00 | \n",
+ " [Dear all,You are kindly invited to the meetin... | \n",
+ " 2018-01-01 | \n",
+ " [neither, 0.07996127979422231, 0.3331293663946... | \n",
+ " neither | \n",
+ " 0.079961 | \n",
+ " 0.333129 | \n",
+ " 0.586909 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " <20180101220004.0E97560A400C@fedocal02.phx2.fe... | \n",
+ " 2018-01-01 22:00:04+00:00 | \n",
+ " [Dear all,You are kindly invited to the meetin... | \n",
+ " 2018-01-01 | \n",
+ " [neither, 0.08164312982342418, 0.3330956077948... | \n",
+ " neither | \n",
+ " 0.081643 | \n",
+ " 0.333096 | \n",
+ " 0.585261 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " <20180101221314.GA52721@rawhide-composer.phx2.... | \n",
+ " 2018-01-01 22:13:15+00:00 | \n",
+ " [OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh... | \n",
+ " 2018-01-01 | \n",
+ " [neither, 0.03325657099633886, 0.3733650971099... | \n",
+ " neither | \n",
+ " 0.033257 | \n",
+ " 0.373365 | \n",
+ " 0.593378 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " <20180101233509.D734E60478E3@bastion01.phx2.fe... | \n",
+ " 2018-01-01 23:35:09+00:00 | \n",
+ " [Missing expected images:Server dvd i386Workst... | \n",
+ " 2018-01-01 | \n",
+ " [neither, 0.039981707371010575, 0.326850382054... | \n",
+ " neither | \n",
+ " 0.039982 | \n",
+ " 0.326850 | \n",
+ " 0.633168 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " <66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com> | \n",
+ " 2018-01-02 10:26:51+01:00 | \n",
+ " [\"Could you please drop the dependency on GCC ... | \n",
+ " 2018-01-01 | \n",
+ " [neither, 0.04388574198143961, 0.4128345886699... | \n",
+ " neither | \n",
+ " 0.043886 | \n",
+ " 0.412835 | \n",
+ " 0.543280 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Message-ID \\\n",
+ "0 <20180101220004.0632660A400B@fedocal02.phx2.fe... \n",
+ "1 <20180101220004.0E97560A400C@fedocal02.phx2.fe... \n",
+ "2 <20180101221314.GA52721@rawhide-composer.phx2.... \n",
+ "3 <20180101233509.D734E60478E3@bastion01.phx2.fe... \n",
+ "4 <66075732-52f6-2eb8-de1b-d89ec18244db@redhat.com> \n",
+ "\n",
+ " Date \\\n",
+ "0 2018-01-01 22:00:04+00:00 \n",
+ "1 2018-01-01 22:00:04+00:00 \n",
+ "2 2018-01-01 22:13:15+00:00 \n",
+ "3 2018-01-01 23:35:09+00:00 \n",
+ "4 2018-01-02 10:26:51+01:00 \n",
+ "\n",
+ " Body Chunk \\\n",
+ "0 [Dear all,You are kindly invited to the meetin... 2018-01-01 \n",
+ "1 [Dear all,You are kindly invited to the meetin... 2018-01-01 \n",
+ "2 [OLD: FedoraRawhide20171231.n.0NEW: FedoraRawh... 2018-01-01 \n",
+ "3 [Missing expected images:Server dvd i386Workst... 2018-01-01 \n",
+ "4 [\"Could you please drop the dependency on GCC ... 2018-01-01 \n",
+ "\n",
+ " sonar Top Hate Speech \\\n",
+ "0 [neither, 0.07996127979422231, 0.3331293663946... neither 0.079961 \n",
+ "1 [neither, 0.08164312982342418, 0.3330956077948... neither 0.081643 \n",
+ "2 [neither, 0.03325657099633886, 0.3733650971099... neither 0.033257 \n",
+ "3 [neither, 0.039981707371010575, 0.326850382054... neither 0.039982 \n",
+ "4 [neither, 0.04388574198143961, 0.4128345886699... neither 0.043886 \n",
+ "\n",
+ " Offensive Language Neither \n",
+ "0 0.333129 0.586909 \n",
+ "1 0.333096 0.585261 \n",
+ "2 0.373365 0.593378 \n",
+ "3 0.326850 0.633168 \n",
+ "4 0.412835 0.543280 "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clean.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-16T15:18:43.038824Z",
+ "start_time": "2021-03-16T15:18:43.031793Z"
+ }
+ },
+ "source": [
+ "### Offensive Language classification\n",
+ "\n",
+ "From a high-level analysis, several of the flagged messages either contained a lot of excess text (most likely from links) or used more direct language when explaining issues."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2021-03-30T19:24:28.967131Z",
+ "start_time": "2021-03-30T19:24:28.684843Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Message-ID | \n",
+ " Date | \n",
+ " Body | \n",
+ " Chunk | \n",
+ " sonar | \n",
+ " Top | \n",
+ " Hate Speech | \n",
+ " Offensive Language | \n",
+ " Neither | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 316 | \n",
+ " <6f26913b-d3cd-7ef2-000e-9f5931db179b@redhat.com> | \n",
+ " 2018-01-25 09:26:52+01:00 | \n",
+ " [Just to illustrate what this is about, these ... | \n",
+ " 2018-01-01 | \n",
+ " [offensive_language, 0.055735411386999514, 0.4... | \n",
+ " offensive_language | \n",
+ " 0.055735 | \n",
+ " 0.474144 | \n",
+ " 0.470121 | \n",
+ "
\n",
+ " \n",
+ " 421 | \n",
+ " <ufabmh7arll.fsf@epithumnia.math.uh.edu> | \n",
+ " 2018-02-02 17:39:50-06:00 | \n",
+ " [\"Actually comprehending your message, I see i... | \n",
+ " 2018-02-01 | \n",
+ " [offensive_language, 0.05987457809697904, 0.47... | \n",
+ " offensive_language | \n",
+ " 0.059875 | \n",
+ " 0.476603 | \n",
+ " 0.463522 | \n",
+ "
\n",
+ " \n",
+ " 517 | \n",
+ " <CALC7GWx5vt10tK9m4PajtnEZN6kqNDE+4m==MTJq_8Dr... | \n",
+ " 2018-02-13 02:00:46+01:00 | \n",
+ " [I don\\t think, removing the changelog entirel... | \n",
+ " 2018-02-01 | \n",
+ " [offensive_language, 0.0594192768366574, 0.484... | \n",
+ " offensive_language | \n",
+ " 0.059419 | \n",
+ " 0.484444 | \n",
+ " 0.456137 | \n",
+ "
\n",
+ " \n",
+ " 611 | \n",
+ " <20180218173857.12956.59900@mailman01.phx2.fed... | \n",
+ " 2018-02-18 17:38:57+00:00 | \n",
+ " [ If you fixed package(s), Just to make sure: ... | \n",
+ " 2018-02-01 | \n",
+ " [offensive_language, 0.03715599088459851, 0.49... | \n",
+ " offensive_language | \n",
+ " 0.037156 | \n",
+ " 0.491301 | \n",
+ " 0.471543 | \n",
+ "
\n",
+ " \n",
+ " 616 | \n",
+ " <CABB28CxRa5NdyPp76wA88FRQm1rc8=A5TQgonhu1f+oQ... | \n",
+ " 2018-02-18 20:50:06+00:00 | \n",
+ " [\"On 18 February 2018 at 18:06, Stephen John S... | \n",
+ " 2018-02-01 | \n",
+ " [offensive_language, 0.056053959942993656, 0.4... | \n",
+ " offensive_language | \n",
+ " 0.056054 | \n",
+ " 0.478286 | \n",
+ " 0.465661 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Message-ID \\\n",
+ "316 <6f26913b-d3cd-7ef2-000e-9f5931db179b@redhat.com> \n",
+ "421 \n",
+ "517