diff --git a/emnlp2021/style_transfer/mining_parallel_corpus/finetune_t5_on_mined.ipynb b/emnlp2021/style_transfer/mining_parallel_corpus/finetune_t5_on_mined.ipynb
new file mode 100644
index 0000000..f94bce7
--- /dev/null
+++ b/emnlp2021/style_transfer/mining_parallel_corpus/finetune_t5_on_mined.ipynb
@@ -0,0 +1,2264 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '1'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Train a paraphraser on the mined data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(223823, 7)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " reference | \n",
+ " translation | \n",
+ " similarity | \n",
+ " lenght_diff | \n",
+ " ref_tox | \n",
+ " trn_tox | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 191205 | \n",
+ " 191205 | \n",
+ " She must be out there somewhere. | \n",
+ " he must be out there, damn it. | \n",
+ " 0.629855 | \n",
+ " 0.060606 | \n",
+ " 0.000061 | \n",
+ " 0.995908 | \n",
+ "
\n",
+ " \n",
+ " 197885 | \n",
+ " 197885 | \n",
+ " \"Nuts! | \n",
+ " \"bullshit! | \n",
+ " 0.743836 | \n",
+ " 0.363636 | \n",
+ " 0.028349 | \n",
+ " 0.999605 | \n",
+ "
\n",
+ " \n",
+ " 143495 | \n",
+ " 143495 | \n",
+ " You are only about one half loser, the other h... | \n",
+ " from the second half, you're the winner. | \n",
+ " 0.778499 | \n",
+ " 0.293103 | \n",
+ " 0.932930 | \n",
+ " 0.000052 | \n",
+ "
\n",
+ " \n",
+ " 112935 | \n",
+ " 112935 | \n",
+ " Kick some NSC booty. | \n",
+ " kick some NSC asses. | \n",
+ " 0.901688 | \n",
+ " 0.000000 | \n",
+ " 0.359050 | \n",
+ " 0.998997 | \n",
+ "
\n",
+ " \n",
+ " 52732 | \n",
+ " 52732 | \n",
+ " Because it's where they used to keep prisoners... | \n",
+ " because they held prisoners there before they ... | \n",
+ " 0.837865 | \n",
+ " 0.162500 | \n",
+ " 0.953626 | \n",
+ " 0.074862 | \n",
+ "
\n",
+ " \n",
+ " 95104 | \n",
+ " 95104 | \n",
+ " He wants us to blow him away! | \n",
+ " he wants us to kill him! | \n",
+ " 0.766668 | \n",
+ " 0.166667 | \n",
+ " 0.000662 | \n",
+ " 0.999194 | \n",
+ "
\n",
+ " \n",
+ " 200609 | \n",
+ " 200609 | \n",
+ " 'You don't bash a man's brains out when he's t... | \n",
+ " \"you can't break the head of a man tied up in ... | \n",
+ " 0.757032 | \n",
+ " 0.111111 | \n",
+ " 0.893770 | \n",
+ " 0.006597 | \n",
+ "
\n",
+ " \n",
+ " 85321 | \n",
+ " 85321 | \n",
+ " \"Here's the bats, if you got the balls\". | \n",
+ " \"take them, if you have the balls!\" | \n",
+ " 0.786242 | \n",
+ " 0.121951 | \n",
+ " 0.000385 | \n",
+ " 0.998311 | \n",
+ "
\n",
+ " \n",
+ " 73313 | \n",
+ " 73313 | \n",
+ " I don't want to fucking hear it. | \n",
+ " I don't want to hear anything. | \n",
+ " 0.828845 | \n",
+ " 0.060606 | \n",
+ " 0.980057 | \n",
+ " 0.000058 | \n",
+ "
\n",
+ " \n",
+ " 146935 | \n",
+ " 146935 | \n",
+ " One, you can be a waitress, or you can be a ca... | \n",
+ " one that you're gonna be a waitress...... or y... | \n",
+ " 0.708999 | \n",
+ " 0.214286 | \n",
+ " 0.000142 | \n",
+ " 0.999406 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 reference \\\n",
+ "191205 191205 She must be out there somewhere. \n",
+ "197885 197885 \"Nuts! \n",
+ "143495 143495 You are only about one half loser, the other h... \n",
+ "112935 112935 Kick some NSC booty. \n",
+ "52732 52732 Because it's where they used to keep prisoners... \n",
+ "95104 95104 He wants us to blow him away! \n",
+ "200609 200609 'You don't bash a man's brains out when he's t... \n",
+ "85321 85321 \"Here's the bats, if you got the balls\". \n",
+ "73313 73313 I don't want to fucking hear it. \n",
+ "146935 146935 One, you can be a waitress, or you can be a ca... \n",
+ "\n",
+ " translation similarity \\\n",
+ "191205 he must be out there, damn it. 0.629855 \n",
+ "197885 \"bullshit! 0.743836 \n",
+ "143495 from the second half, you're the winner. 0.778499 \n",
+ "112935 kick some NSC asses. 0.901688 \n",
+ "52732 because they held prisoners there before they ... 0.837865 \n",
+ "95104 he wants us to kill him! 0.766668 \n",
+ "200609 \"you can't break the head of a man tied up in ... 0.757032 \n",
+ "85321 \"take them, if you have the balls!\" 0.786242 \n",
+ "73313 I don't want to hear anything. 0.828845 \n",
+ "146935 one that you're gonna be a waitress...... or y... 0.708999 \n",
+ "\n",
+ " lenght_diff ref_tox trn_tox \n",
+ "191205 0.060606 0.000061 0.995908 \n",
+ "197885 0.363636 0.028349 0.999605 \n",
+ "143495 0.293103 0.932930 0.000052 \n",
+ "112935 0.000000 0.359050 0.998997 \n",
+ "52732 0.162500 0.953626 0.074862 \n",
+ "95104 0.166667 0.000662 0.999194 \n",
+ "200609 0.111111 0.893770 0.006597 \n",
+ "85321 0.121951 0.000385 0.998311 \n",
+ "73313 0.060606 0.980057 0.000058 \n",
+ "146935 0.214286 0.000142 0.999406 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_csv('filtered.tsv', sep='\\t', encoding='utf-8')\n",
+ "print(df.shape)\n",
+ "df.sample(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.5521639867216506"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(df.ref_tox > df.trn_tox).mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "xx = []\n",
+ "yy = []\n",
+ "for i, row in df.iterrows():\n",
+ " if row.ref_tox > row.trn_tox:\n",
+ " xx.append(row.reference)\n",
+ " yy.append(row.translation)\n",
+ " else:\n",
+ " yy.append(row.reference)\n",
+ " xx.append(row.translation)\n",
+ " \n",
+ "xydf = pd.DataFrame({'source': xx, 'target': yy})"
+ ]
+ },
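+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check (added sketch, not part of the original run): after the flip\n",
+ "# above, the source side of every pair should be the more toxic one, so the\n",
+ "# mean of the row-wise max toxicity should sit far above the mean of the min.\n",
+ "toxic_side = df[['ref_tox', 'trn_tox']].max(axis=1)\n",
+ "neutral_side = df[['ref_tox', 'trn_tox']].min(axis=1)\n",
+ "print(toxic_side.mean(), neutral_side.mean())"
+ ]
+ },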
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare datasets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import (\n",
+ " AdamW,\n",
+ " T5ForConditionalGeneration,\n",
+ " T5Tokenizer, T5TokenizerFast,\n",
+ " get_linear_schedule_with_warmup\n",
+ ")\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = \"ceshine/t5-paraphrase-paws-msrp-opinosis\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = T5TokenizerFast.from_pretrained(model_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "223523 300\n"
+ ]
+ }
+ ],
+ "source": [
+ "df_train, df_test = train_test_split(xydf, test_size=300)\n",
+ "print(df_train.shape[0], df_test.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 56.1 s, sys: 1.78 s, total: 57.9 s\n",
+ "Wall time: 8.06 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "x1 = tokenizer(df_train.source.tolist(), truncation=True)\n",
+ "y1 = tokenizer(df_train.target.tolist(), truncation=True)\n",
+ "x2 = tokenizer(df_test.source.tolist(), truncation=True)\n",
+ "y2 = tokenizer(df_test.target.tolist(), truncation=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(223523, 300)"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class PairsDataset(torch.utils.data.Dataset):\n",
+ " def __init__(self, x, y):\n",
+ " self.x = x\n",
+ " self.y = y\n",
+ "\n",
+ " def __getitem__(self, idx):\n",
+ " assert idx < len(self.x['input_ids'])\n",
+ " item = {key: val[idx] for key, val in self.x.items()}\n",
+ " item['decoder_attention_mask'] = self.y['attention_mask'][idx]\n",
+ " item['labels'] = self.y['input_ids'][idx]\n",
+ " return item\n",
+ " \n",
+ " @property\n",
+ " def n(self):\n",
+ " return len(self.x['input_ids'])\n",
+ "\n",
+ " def __len__(self):\n",
+ " return self.n # * 2\n",
+ " \n",
+ "train_dataset = PairsDataset(x1, y1)\n",
+ "test_dataset = PairsDataset(x2, y2)\n",
+ "len(train_dataset), len(test_dataset)"
+ ]
+ },
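+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Quick look at one training example (added sketch): each item holds encoder\n",
+ "# input_ids and attention_mask plus decoder labels and mask, still unpadded.\n",
+ "{k: len(v) for k, v in train_dataset[0].items()}"
+ ]
+ },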
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.utils.data import Dataset, DataLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_dataloader = DataLoader(train_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)\n",
+ "test_dataloader = DataLoader(test_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Fine tune t5"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import (\n",
+ " AdamW,\n",
+ " T5ForConditionalGeneration,\n",
+ " T5Tokenizer,\n",
+ " get_linear_schedule_with_warmup\n",
+ ")\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "checkpoint_name = 'SkolkovoInstitute/t5-paraphrase-paws-msrp-opinosis-paranmt'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "device = torch.device('cuda:0')\n",
+ "model.to(device);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import transformers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import Trainer, TrainingArguments\n",
+ "from transformers.file_utils import cached_property\n",
+ "from typing import Tuple\n",
+ "\n",
+ "class TrAr(TrainingArguments):\n",
+ " @cached_property\n",
+ " def _setup_devices(self):\n",
+ " return device"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List, Dict, Union\n",
+ "\n",
+ "class DataCollatorWithPadding:\n",
+ " def __init__(self, tokenizer):\n",
+ " self.tokenizer = tokenizer\n",
+ "\n",
+ " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n",
+ " batch = self.tokenizer.pad(\n",
+ " features,\n",
+ " padding=True,\n",
+ " )\n",
+ " ybatch = self.tokenizer.pad(\n",
+ " {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},\n",
+ " padding=True,\n",
+ " ) \n",
+ " batch['labels'] = ybatch['input_ids']\n",
+ " batch['decoder_attention_mask'] = ybatch['attention_mask']\n",
+ " \n",
+ " return {k: torch.tensor(v) for k, v in batch.items()}"
+ ]
+ },
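+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Collator check (added sketch): a handful of items should come back as\n",
+ "# rectangular tensors, padded on both the encoder and the decoder side.\n",
+ "demo = DataCollatorWithPadding(tokenizer=tokenizer)([train_dataset[i] for i in range(3)])\n",
+ "{k: v.shape for k, v in demo.items()}"
+ ]
+ },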
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_name = 'models/t5-cechine-nmt-mined-detox'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "todo: maybe, batch > 4 would do as well"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_args = TrAr(\n",
+ " output_dir=save_name, # output directory\n",
+ " overwrite_output_dir=True,\n",
+ " num_train_epochs=3, # total # of training epochs\n",
+ " per_device_train_batch_size=4, # batch size per device during training\n",
+ " gradient_accumulation_steps=4,\n",
+ " per_device_eval_batch_size=8, # batch size for evaluation\n",
+ " warmup_steps=300, # number of warmup steps for learning rate scheduler\n",
+ " weight_decay=0, # strength of weight decay\n",
+ " learning_rate=3e-5,\n",
+ " logging_dir='./logs', # directory for storing logs\n",
+ " logging_steps=100,\n",
+ " eval_steps=100,\n",
+ " evaluation_strategy='steps',\n",
+ " save_total_limit=1,\n",
+ " save_steps=5000,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "trainer = Trainer(\n",
+ " model=model, # the instantiated 🤗 Transformers model to be trained\n",
+ " args=training_args, # training arguments, defined above\n",
+ " train_dataset=train_dataset, # training dataset\n",
+ " eval_dataset=test_dataset, # evaluation dataset\n",
+ " data_collator=data_collator,\n",
+ " tokenizer=tokenizer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import gc\n",
+ "gc.collect()\n",
+ "torch.cuda.empty_cache();"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " [21330/41910 2:15:15 < 2:10:30, 2.63 it/s, Epoch 1.53/3]\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Step | \n",
+ " Training Loss | \n",
+ " Validation Loss | \n",
+ " Runtime | \n",
+ " Samples Per Second | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 100 | \n",
+ " 1.084600 | \n",
+ " 0.793464 | \n",
+ " 1.229200 | \n",
+ " 244.053000 | \n",
+ "
\n",
+ " \n",
+ " 200 | \n",
+ " 1.090200 | \n",
+ " 0.788172 | \n",
+ " 1.226800 | \n",
+ " 244.531000 | \n",
+ "
\n",
+ " \n",
+ " 300 | \n",
+ " 1.109400 | \n",
+ " 0.785306 | \n",
+ " 1.238100 | \n",
+ " 242.309000 | \n",
+ "
\n",
+ " \n",
+ " 400 | \n",
+ " 1.077300 | \n",
+ " 0.784683 | \n",
+ " 1.217100 | \n",
+ " 246.494000 | \n",
+ "
\n",
+ " \n",
+ " 500 | \n",
+ " 1.089100 | \n",
+ " 0.783029 | \n",
+ " 1.259200 | \n",
+ " 238.245000 | \n",
+ "
\n",
+ " \n",
+ " 600 | \n",
+ " 1.077200 | \n",
+ " 0.781447 | \n",
+ " 1.260200 | \n",
+ " 238.060000 | \n",
+ "
\n",
+ " \n",
+ " 700 | \n",
+ " 1.076900 | \n",
+ " 0.779660 | \n",
+ " 1.212500 | \n",
+ " 247.413000 | \n",
+ "
\n",
+ " \n",
+ " 800 | \n",
+ " 1.057900 | \n",
+ " 0.778833 | \n",
+ " 1.222500 | \n",
+ " 245.389000 | \n",
+ "
\n",
+ " \n",
+ " 900 | \n",
+ " 1.078100 | \n",
+ " 0.778808 | \n",
+ " 1.223800 | \n",
+ " 245.132000 | \n",
+ "
\n",
+ " \n",
+ " 1000 | \n",
+ " 1.067300 | \n",
+ " 0.778335 | \n",
+ " 1.223300 | \n",
+ " 245.228000 | \n",
+ "
\n",
+ " \n",
+ " 1100 | \n",
+ " 1.051000 | \n",
+ " 0.778253 | \n",
+ " 1.245100 | \n",
+ " 240.952000 | \n",
+ "
\n",
+ " \n",
+ " 1200 | \n",
+ " 1.042900 | \n",
+ " 0.775767 | \n",
+ " 1.266100 | \n",
+ " 236.957000 | \n",
+ "
\n",
+ " \n",
+ " 1300 | \n",
+ " 1.070000 | \n",
+ " 0.775742 | \n",
+ " 1.251100 | \n",
+ " 239.797000 | \n",
+ "
\n",
+ " \n",
+ " 1400 | \n",
+ " 1.074900 | \n",
+ " 0.774952 | \n",
+ " 1.269100 | \n",
+ " 236.395000 | \n",
+ "
\n",
+ " \n",
+ " 1500 | \n",
+ " 1.053900 | \n",
+ " 0.775381 | \n",
+ " 1.251400 | \n",
+ " 239.725000 | \n",
+ "
\n",
+ " \n",
+ " 1600 | \n",
+ " 1.081500 | \n",
+ " 0.771725 | \n",
+ " 1.213400 | \n",
+ " 247.232000 | \n",
+ "
\n",
+ " \n",
+ " 1700 | \n",
+ " 1.080900 | \n",
+ " 0.771665 | \n",
+ " 1.220500 | \n",
+ " 245.809000 | \n",
+ "
\n",
+ " \n",
+ " 1800 | \n",
+ " 1.075500 | \n",
+ " 0.771367 | \n",
+ " 1.230700 | \n",
+ " 243.770000 | \n",
+ "
\n",
+ " \n",
+ " 1900 | \n",
+ " 1.075400 | \n",
+ " 0.770659 | \n",
+ " 1.214500 | \n",
+ " 247.022000 | \n",
+ "
\n",
+ " \n",
+ " 2000 | \n",
+ " 1.039600 | \n",
+ " 0.770911 | \n",
+ " 1.227600 | \n",
+ " 244.382000 | \n",
+ "
\n",
+ " \n",
+ " 2100 | \n",
+ " 1.043300 | \n",
+ " 0.771129 | \n",
+ " 1.211900 | \n",
+ " 247.553000 | \n",
+ "
\n",
+ " \n",
+ " 2200 | \n",
+ " 1.042900 | \n",
+ " 0.769076 | \n",
+ " 1.247500 | \n",
+ " 240.490000 | \n",
+ "
\n",
+ " \n",
+ " 2300 | \n",
+ " 1.062000 | \n",
+ " 0.769138 | \n",
+ " 1.266100 | \n",
+ " 236.940000 | \n",
+ "
\n",
+ " \n",
+ " 2400 | \n",
+ " 1.085700 | \n",
+ " 0.768502 | \n",
+ " 1.237500 | \n",
+ " 242.424000 | \n",
+ "
\n",
+ " \n",
+ " 2500 | \n",
+ " 1.053600 | \n",
+ " 0.767439 | \n",
+ " 1.266000 | \n",
+ " 236.970000 | \n",
+ "
\n",
+ " \n",
+ " 2600 | \n",
+ " 1.029700 | \n",
+ " 0.767492 | \n",
+ " 1.242600 | \n",
+ " 241.429000 | \n",
+ "
\n",
+ " \n",
+ " 2700 | \n",
+ " 1.049200 | \n",
+ " 0.766390 | \n",
+ " 1.209600 | \n",
+ " 248.013000 | \n",
+ "
\n",
+ " \n",
+ " 2800 | \n",
+ " 1.046600 | \n",
+ " 0.766780 | \n",
+ " 1.218400 | \n",
+ " 246.229000 | \n",
+ "
\n",
+ " \n",
+ " 2900 | \n",
+ " 1.055300 | \n",
+ " 0.767001 | \n",
+ " 1.282000 | \n",
+ " 234.001000 | \n",
+ "
\n",
+ " \n",
+ " 3000 | \n",
+ " 1.073500 | \n",
+ " 0.764760 | \n",
+ " 1.257000 | \n",
+ " 238.671000 | \n",
+ "
\n",
+ " \n",
+ " 3100 | \n",
+ " 1.063700 | \n",
+ " 0.765447 | \n",
+ " 1.254900 | \n",
+ " 239.054000 | \n",
+ "
\n",
+ " \n",
+ " 3200 | \n",
+ " 1.053200 | \n",
+ " 0.764491 | \n",
+ " 1.255600 | \n",
+ " 238.930000 | \n",
+ "
\n",
+ " \n",
+ " 3300 | \n",
+ " 1.025200 | \n",
+ " 0.764375 | \n",
+ " 1.249100 | \n",
+ " 240.169000 | \n",
+ "
\n",
+ " \n",
+ " 3400 | \n",
+ " 1.083700 | \n",
+ " 0.764105 | \n",
+ " 1.247100 | \n",
+ " 240.566000 | \n",
+ "
\n",
+ " \n",
+ " 3500 | \n",
+ " 1.030100 | \n",
+ " 0.763678 | \n",
+ " 1.255300 | \n",
+ " 238.990000 | \n",
+ "
\n",
+ " \n",
+ " 3600 | \n",
+ " 1.065100 | \n",
+ " 0.763230 | \n",
+ " 1.243700 | \n",
+ " 241.221000 | \n",
+ "
\n",
+ " \n",
+ " 3700 | \n",
+ " 1.064300 | \n",
+ " 0.762037 | \n",
+ " 1.205700 | \n",
+ " 248.814000 | \n",
+ "
\n",
+ " \n",
+ " 3800 | \n",
+ " 1.032000 | \n",
+ " 0.762514 | \n",
+ " 1.218300 | \n",
+ " 246.241000 | \n",
+ "
\n",
+ " \n",
+ " 3900 | \n",
+ " 1.037300 | \n",
+ " 0.761514 | \n",
+ " 1.208400 | \n",
+ " 248.254000 | \n",
+ "
\n",
+ " \n",
+ " 4000 | \n",
+ " 1.055200 | \n",
+ " 0.761335 | \n",
+ " 1.217000 | \n",
+ " 246.503000 | \n",
+ "
\n",
+ " \n",
+ " 4100 | \n",
+ " 1.064000 | \n",
+ " 0.761690 | \n",
+ " 1.207100 | \n",
+ " 248.526000 | \n",
+ "
\n",
+ " \n",
+ " 4200 | \n",
+ " 1.066300 | \n",
+ " 0.762006 | \n",
+ " 1.218900 | \n",
+ " 246.125000 | \n",
+ "
\n",
+ " \n",
+ " 4300 | \n",
+ " 1.061100 | \n",
+ " 0.760712 | \n",
+ " 1.214600 | \n",
+ " 247.001000 | \n",
+ "
\n",
+ " \n",
+ " 4400 | \n",
+ " 1.058400 | \n",
+ " 0.761347 | \n",
+ " 1.208400 | \n",
+ " 248.260000 | \n",
+ "
\n",
+ " \n",
+ " 4500 | \n",
+ " 1.048500 | \n",
+ " 0.760592 | \n",
+ " 1.213300 | \n",
+ " 247.252000 | \n",
+ "
\n",
+ " \n",
+ " 4600 | \n",
+ " 1.035200 | \n",
+ " 0.759609 | \n",
+ " 1.215600 | \n",
+ " 246.783000 | \n",
+ "
\n",
+ " \n",
+ " 4700 | \n",
+ " 1.060900 | \n",
+ " 0.759173 | \n",
+ " 1.218700 | \n",
+ " 246.171000 | \n",
+ "
\n",
+ " \n",
+ " 4800 | \n",
+ " 1.052000 | \n",
+ " 0.759094 | \n",
+ " 1.257100 | \n",
+ " 238.635000 | \n",
+ "
\n",
+ " \n",
+ " 4900 | \n",
+ " 1.052600 | \n",
+ " 0.759395 | \n",
+ " 1.232700 | \n",
+ " 243.361000 | \n",
+ "
\n",
+ " \n",
+ " 5000 | \n",
+ " 1.036300 | \n",
+ " 0.759448 | \n",
+ " 1.252600 | \n",
+ " 239.511000 | \n",
+ "
\n",
+ " \n",
+ " 5100 | \n",
+ " 1.082100 | \n",
+ " 0.759520 | \n",
+ " 1.250100 | \n",
+ " 239.990000 | \n",
+ "
\n",
+ " \n",
+ " 5200 | \n",
+ " 1.025200 | \n",
+ " 0.758338 | \n",
+ " 1.208200 | \n",
+ " 248.307000 | \n",
+ "
\n",
+ " \n",
+ " 5300 | \n",
+ " 1.076900 | \n",
+ " 0.757079 | \n",
+ " 1.211300 | \n",
+ " 247.668000 | \n",
+ "
\n",
+ " \n",
+ " 5400 | \n",
+ " 1.071000 | \n",
+ " 0.756892 | \n",
+ " 1.216700 | \n",
+ " 246.574000 | \n",
+ "
\n",
+ " \n",
+ " 5500 | \n",
+ " 1.042500 | \n",
+ " 0.756589 | \n",
+ " 1.208200 | \n",
+ " 248.309000 | \n",
+ "
\n",
+ " \n",
+ " 5600 | \n",
+ " 1.078800 | \n",
+ " 0.756307 | \n",
+ " 1.246000 | \n",
+ " 240.778000 | \n",
+ "
\n",
+ " \n",
+ " 5700 | \n",
+ " 1.039700 | \n",
+ " 0.756938 | \n",
+ " 1.216500 | \n",
+ " 246.619000 | \n",
+ "
\n",
+ " \n",
+ " 5800 | \n",
+ " 1.037400 | \n",
+ " 0.756166 | \n",
+ " 1.211700 | \n",
+ " 247.588000 | \n",
+ "
\n",
+ " \n",
+ " 5900 | \n",
+ " 1.078400 | \n",
+ " 0.756119 | \n",
+ " 1.282300 | \n",
+ " 233.960000 | \n",
+ "
\n",
+ " \n",
+ " 6000 | \n",
+ " 1.060600 | \n",
+ " 0.755657 | \n",
+ " 1.263600 | \n",
+ " 237.423000 | \n",
+ "
\n",
+ " \n",
+ " 6100 | \n",
+ " 1.037800 | \n",
+ " 0.755987 | \n",
+ " 1.231000 | \n",
+ " 243.706000 | \n",
+ "
\n",
+ " \n",
+ " 6200 | \n",
+ " 1.048200 | \n",
+ " 0.756291 | \n",
+ " 1.255300 | \n",
+ " 238.994000 | \n",
+ "
\n",
+ " \n",
+ " 6300 | \n",
+ " 1.053100 | \n",
+ " 0.755330 | \n",
+ " 1.275900 | \n",
+ " 235.130000 | \n",
+ "
\n",
+ " \n",
+ " 6400 | \n",
+ " 1.029800 | \n",
+ " 0.755506 | \n",
+ " 1.272900 | \n",
+ " 235.685000 | \n",
+ "
\n",
+ " \n",
+ " 6500 | \n",
+ " 1.057200 | \n",
+ " 0.755173 | \n",
+ " 1.268600 | \n",
+ " 236.478000 | \n",
+ "
\n",
+ " \n",
+ " 6600 | \n",
+ " 1.062400 | \n",
+ " 0.754754 | \n",
+ " 1.238500 | \n",
+ " 242.228000 | \n",
+ "
\n",
+ " \n",
+ " 6700 | \n",
+ " 1.038600 | \n",
+ " 0.755378 | \n",
+ " 1.238600 | \n",
+ " 242.204000 | \n",
+ "
\n",
+ " \n",
+ " 6800 | \n",
+ " 1.048100 | \n",
+ " 0.754559 | \n",
+ " 1.237700 | \n",
+ " 242.392000 | \n",
+ "
\n",
+ " \n",
+ " 6900 | \n",
+ " 1.020800 | \n",
+ " 0.753233 | \n",
+ " 1.266900 | \n",
+ " 236.799000 | \n",
+ "
\n",
+ " \n",
+ " 7000 | \n",
+ " 1.010500 | \n",
+ " 0.752919 | \n",
+ " 1.259100 | \n",
+ " 238.258000 | \n",
+ "
\n",
+ " \n",
+ " 7100 | \n",
+ " 1.018300 | \n",
+ " 0.753768 | \n",
+ " 1.230400 | \n",
+ " 243.815000 | \n",
+ "
\n",
+ " \n",
+ " 7200 | \n",
+ " 1.055100 | \n",
+ " 0.752936 | \n",
+ " 1.228700 | \n",
+ " 244.156000 | \n",
+ "
\n",
+ " \n",
+ " 7300 | \n",
+ " 1.031800 | \n",
+ " 0.753859 | \n",
+ " 1.227300 | \n",
+ " 244.444000 | \n",
+ "
\n",
+ " \n",
+ " 7400 | \n",
+ " 1.023900 | \n",
+ " 0.753472 | \n",
+ " 1.231600 | \n",
+ " 243.594000 | \n",
+ "
\n",
+ " \n",
+ " 7500 | \n",
+ " 1.048800 | \n",
+ " 0.753128 | \n",
+ " 1.232500 | \n",
+ " 243.413000 | \n",
+ "
\n",
+ " \n",
+ " 7600 | \n",
+ " 1.050900 | \n",
+ " 0.753191 | \n",
+ " 1.235600 | \n",
+ " 242.798000 | \n",
+ "
\n",
+ " \n",
+ " 7700 | \n",
+ " 1.046300 | \n",
+ " 0.752935 | \n",
+ " 1.231200 | \n",
+ " 243.658000 | \n",
+ "
\n",
+ " \n",
+ " 7800 | \n",
+ " 1.055300 | \n",
+ " 0.752511 | \n",
+ " 1.234200 | \n",
+ " 243.071000 | \n",
+ "
\n",
+ " \n",
+ " 7900 | \n",
+ " 1.044100 | \n",
+ " 0.753403 | \n",
+ " 1.239000 | \n",
+ " 242.138000 | \n",
+ "
\n",
+ " \n",
+ " 8000 | \n",
+ " 1.040900 | \n",
+ " 0.752865 | \n",
+ " 1.235300 | \n",
+ " 242.858000 | \n",
+ "
\n",
+ " \n",
+ " 8100 | \n",
+ " 1.038500 | \n",
+ " 0.752217 | \n",
+ " 1.237500 | \n",
+ " 242.424000 | \n",
+ "
\n",
+ " \n",
+ " 8200 | \n",
+ " 1.011900 | \n",
+ " 0.752405 | \n",
+ " 1.238100 | \n",
+ " 242.300000 | \n",
+ "
\n",
+ " \n",
+ " 8300 | \n",
+ " 1.033400 | \n",
+ " 0.752253 | \n",
+ " 1.279700 | \n",
+ " 234.428000 | \n",
+ "
\n",
+ " \n",
+ " 8400 | \n",
+ " 1.034600 | \n",
+ " 0.752704 | \n",
+ " 1.244700 | \n",
+ " 241.014000 | \n",
+ "
\n",
+ " \n",
+ " 8500 | \n",
+ " 1.025600 | \n",
+ " 0.752743 | \n",
+ " 1.241000 | \n",
+ " 241.735000 | \n",
+ "
\n",
+ " \n",
+ " 8600 | \n",
+ " 1.030000 | \n",
+ " 0.752871 | \n",
+ " 1.243200 | \n",
+ " 241.309000 | \n",
+ "
\n",
+ " \n",
+ " 8700 | \n",
+ " 1.025000 | \n",
+ " 0.751974 | \n",
+ " 1.248200 | \n",
+ " 240.352000 | \n",
+ "
\n",
+ " \n",
+ " 8800 | \n",
+ " 1.026400 | \n",
+ " 0.750923 | \n",
+ " 1.242600 | \n",
+ " 241.424000 | \n",
+ "
\n",
+ " \n",
+ " 8900 | \n",
+ " 1.035900 | \n",
+ " 0.751470 | \n",
+ " 1.241100 | \n",
+ " 241.724000 | \n",
+ "
\n",
+ " \n",
+ " 9000 | \n",
+ " 1.073900 | \n",
+ " 0.750855 | \n",
+ " 1.242900 | \n",
+ " 241.375000 | \n",
+ "
\n",
+ " \n",
+ " 9100 | \n",
+ " 1.057800 | \n",
+ " 0.750324 | \n",
+ " 1.243500 | \n",
+ " 241.258000 | \n",
+ "
\n",
+ " \n",
+ " 9200 | \n",
+ " 1.071400 | \n",
+ " 0.749522 | \n",
+ " 1.242900 | \n",
+ " 241.378000 | \n",
+ "
\n",
+ " \n",
+ " 9300 | \n",
+ " 1.008700 | \n",
+ " 0.749787 | \n",
+ " 1.243900 | \n",
+ " 241.183000 | \n",
+ "
\n",
+ " \n",
+ " 9400 | \n",
+ " 1.019000 | \n",
+ " 0.750012 | \n",
+ " 1.246600 | \n",
+ " 240.653000 | \n",
+ "
\n",
+ " \n",
+ " 9500 | \n",
+ " 1.026500 | \n",
+ " 0.749811 | \n",
+ " 1.272300 | \n",
+ " 235.790000 | \n",
+ "
\n",
+ " \n",
+ " 9600 | \n",
+ " 1.012300 | \n",
+ " 0.749915 | \n",
+ " 1.254800 | \n",
+ " 239.078000 | \n",
+ "
\n",
+ " \n",
+ " 9700 | \n",
+ " 1.050700 | \n",
+ " 0.748971 | \n",
+ " 1.286300 | \n",
+ " 233.222000 | \n",
+ "
\n",
+ " \n",
+ " 9800 | \n",
+ " 1.028100 | \n",
+ " 0.748931 | \n",
+ " 1.266300 | \n",
+ " 236.904000 | \n",
+ "
\n",
+ " \n",
+ " 9900 | \n",
+ " 1.037500 | \n",
+ " 0.749348 | \n",
+ " 1.250800 | \n",
+ " 239.841000 | \n",
+ "
\n",
+ " \n",
+ " 10000 | \n",
+ " 1.016300 | \n",
+ " 0.749185 | \n",
+ " 1.284500 | \n",
+ " 233.558000 | \n",
+ "
\n",
+ " \n",
+ " 10100 | \n",
+ " 1.004500 | \n",
+ " 0.749256 | \n",
+ " 1.281500 | \n",
+ " 234.098000 | \n",
+ "
\n",
+ " \n",
+ " 10200 | \n",
+ " 1.016600 | \n",
+ " 0.749098 | \n",
+ " 1.250200 | \n",
+ " 239.955000 | \n",
+ "
\n",
+ " \n",
+ " 10300 | \n",
+ " 1.021200 | \n",
+ " 0.747827 | \n",
+ " 1.246300 | \n",
+ " 240.721000 | \n",
+ "
\n",
+ " \n",
+ " 10400 | \n",
+ " 1.021400 | \n",
+ " 0.748277 | \n",
+ " 1.280900 | \n",
+ " 234.210000 | \n",
+ "
\n",
+ " \n",
+ " 10500 | \n",
+ " 1.009000 | \n",
+ " 0.748932 | \n",
+ " 1.258700 | \n",
+ " 238.340000 | \n",
+ "
\n",
+ " \n",
+ " 10600 | \n",
+ " 1.038400 | \n",
+ " 0.748908 | \n",
+ " 1.262000 | \n",
+ " 237.710000 | \n",
+ "
\n",
+ " \n",
+ " 10700 | \n",
+ " 1.006300 | \n",
+ " 0.748689 | \n",
+ " 1.248100 | \n",
+ " 240.369000 | \n",
+ "
\n",
+ " \n",
+ " 10800 | \n",
+ " 1.000600 | \n",
+ " 0.748598 | \n",
+ " 1.272400 | \n",
+ " 235.779000 | \n",
+ "
\n",
+ " \n",
+ " 10900 | \n",
+ " 1.020400 | \n",
+ " 0.748684 | \n",
+ " 1.267700 | \n",
+ " 236.654000 | \n",
+ "
\n",
+ " \n",
+ " 11000 | \n",
+ " 1.027100 | \n",
+ " 0.747368 | \n",
+ " 1.257500 | \n",
+ " 238.576000 | \n",
+ "
\n",
+ " \n",
+ " 11100 | \n",
+ " 1.019900 | \n",
+ " 0.747749 | \n",
+ " 1.279100 | \n",
+ " 234.534000 | \n",
+ "
\n",
+ " \n",
+ " 11200 | \n",
+ " 1.043900 | \n",
+ " 0.747230 | \n",
+ " 1.257400 | \n",
+ " 238.587000 | \n",
+ "
\n",
+ " \n",
+ " 11300 | \n",
+ " 1.047800 | \n",
+ " 0.747071 | \n",
+ " 1.262600 | \n",
+ " 237.613000 | \n",
+ "
\n",
+ " \n",
+ " 11400 | \n",
+ " 1.063600 | \n",
+ " 0.746920 | \n",
+ " 1.266600 | \n",
+ " 236.851000 | \n",
+ "
\n",
+ " \n",
+ " 11500 | \n",
+ " 1.016600 | \n",
+ " 0.747175 | \n",
+ " 1.275900 | \n",
+ " 235.128000 | \n",
+ "
\n",
+ " \n",
+ " 11600 | \n",
+ " 1.040000 | \n",
+ " 0.746895 | \n",
+ " 1.243800 | \n",
+ " 241.200000 | \n",
+ "
\n",
+ " \n",
+ " 11700 | \n",
+ " 1.028600 | \n",
+ " 0.746685 | \n",
+ " 1.240900 | \n",
+ " 241.760000 | \n",
+ "
\n",
+ " \n",
+ " 11800 | \n",
+ " 1.014600 | \n",
+ " 0.746829 | \n",
+ " 1.481100 | \n",
+ " 202.547000 | \n",
+ "
\n",
+ " \n",
+ " 11900 | \n",
+ " 1.017500 | \n",
+ " 0.747098 | \n",
+ " 1.591900 | \n",
+ " 188.452000 | \n",
+ "
\n",
+ " \n",
+ " 12000 | \n",
+ " 0.999900 | \n",
+ " 0.745886 | \n",
+ " 1.469700 | \n",
+ " 204.119000 | \n",
+ "
\n",
+ " \n",
+ " 12100 | \n",
+ " 1.030400 | \n",
+ " 0.745721 | \n",
+ " 1.436700 | \n",
+ " 208.816000 | \n",
+ "
\n",
+ " \n",
+ " 12200 | \n",
+ " 1.039600 | \n",
+ " 0.745957 | \n",
+ " 1.454700 | \n",
+ " 206.223000 | \n",
+ "
\n",
+ " \n",
+ " 12300 | \n",
+ " 1.023200 | \n",
+ " 0.745678 | \n",
+ " 1.836300 | \n",
+ " 163.370000 | \n",
+ "
\n",
+ " \n",
+ " 12400 | \n",
+ " 1.014600 | \n",
+ " 0.745402 | \n",
+ " 1.294600 | \n",
+ " 231.729000 | \n",
+ "
\n",
+ " \n",
+ " 12500 | \n",
+ " 1.033500 | \n",
+ " 0.744856 | \n",
+ " 1.253300 | \n",
+ " 239.374000 | \n",
+ "
\n",
+ " \n",
+ " 12600 | \n",
+ " 1.032200 | \n",
+ " 0.745365 | \n",
+ " 1.547900 | \n",
+ " 193.816000 | \n",
+ "
\n",
+ " \n",
+ " 12700 | \n",
+ " 1.018700 | \n",
+ " 0.745155 | \n",
+ " 1.251700 | \n",
+ " 239.684000 | \n",
+ "
\n",
+ " \n",
+ " 12800 | \n",
+ " 1.016300 | \n",
+ " 0.744066 | \n",
+ " 1.243200 | \n",
+ " 241.310000 | \n",
+ "
\n",
+ " \n",
+ " 12900 | \n",
+ " 1.036900 | \n",
+ " 0.743684 | \n",
+ " 1.282300 | \n",
+ " 233.959000 | \n",
+ "
\n",
+ " \n",
+ " 13000 | \n",
+ " 1.003600 | \n",
+ " 0.744208 | \n",
+ " 1.598900 | \n",
+ " 187.630000 | \n",
+ "
\n",
+ " \n",
+ " 13100 | \n",
+ " 1.017600 | \n",
+ " 0.744630 | \n",
+ " 1.254700 | \n",
+ " 239.095000 | \n",
+ "
\n",
+ " \n",
+ " 13200 | \n",
+ " 1.020900 | \n",
+ " 0.745608 | \n",
+ " 1.240800 | \n",
+ " 241.770000 | \n",
+ "
\n",
+ " \n",
+ " 13300 | \n",
+ " 1.003900 | \n",
+ " 0.745024 | \n",
+ " 1.275700 | \n",
+ " 235.162000 | \n",
+ "
\n",
+ " \n",
+ " 13400 | \n",
+ " 1.044700 | \n",
+ " 0.744231 | \n",
+ " 1.253400 | \n",
+ " 239.349000 | \n",
+ "
\n",
+ " \n",
+ " 13500 | \n",
+ " 0.995200 | \n",
+ " 0.744273 | \n",
+ " 1.644500 | \n",
+ " 182.424000 | \n",
+ "
\n",
+ " \n",
+ " 13600 | \n",
+ " 1.019600 | \n",
+ " 0.743707 | \n",
+ " 1.245900 | \n",
+ " 240.798000 | \n",
+ "
\n",
+ " \n",
+ " 13700 | \n",
+ " 1.012200 | \n",
+ " 0.744101 | \n",
+ " 1.285800 | \n",
+ " 233.320000 | \n",
+ "
\n",
+ " \n",
+ " 13800 | \n",
+ " 1.002000 | \n",
+ " 0.743579 | \n",
+ " 1.267500 | \n",
+ " 236.678000 | \n",
+ "
\n",
+ " \n",
+ " 13900 | \n",
+ " 1.024200 | \n",
+ " 0.743122 | \n",
+ " 1.293900 | \n",
+ " 231.864000 | \n",
+ "
\n",
+ " \n",
+ " 14000 | \n",
+ " 1.014100 | \n",
+ " 0.744132 | \n",
+ " 1.252500 | \n",
+ " 239.522000 | \n",
+ "
\n",
+ " \n",
+ " 14100 | \n",
+ " 1.027900 | \n",
+ " 0.743575 | \n",
+ " 1.269800 | \n",
+ " 236.266000 | \n",
+ "
\n",
+ " \n",
+ " 14200 | \n",
+ " 0.993800 | \n",
+ " 0.743217 | \n",
+ " 1.269300 | \n",
+ " 236.345000 | \n",
+ "
\n",
+ " \n",
+ " 14300 | \n",
+ " 0.985600 | \n",
+ " 0.743278 | \n",
+ " 1.280700 | \n",
+ " 234.251000 | \n",
+ "
\n",
+ " \n",
+ " 14400 | \n",
+ " 0.989600 | \n",
+ " 0.743227 | \n",
+ " 1.255900 | \n",
+ " 238.880000 | \n",
+ "
\n",
+ " \n",
+ " 14500 | \n",
+ " 0.977600 | \n",
+ " 0.743479 | \n",
+ " 1.262200 | \n",
+ " 237.686000 | \n",
+ "
\n",
+ " \n",
+ " 14600 | \n",
+ " 1.000400 | \n",
+ " 0.742927 | \n",
+ " 1.259000 | \n",
+ " 238.282000 | \n",
+ "
\n",
+ " \n",
+ " 14700 | \n",
+ " 0.993400 | \n",
+ " 0.741708 | \n",
+ " 1.539800 | \n",
+ " 194.834000 | \n",
+ "
\n",
+ " \n",
+ " 14800 | \n",
+ " 0.993800 | \n",
+ " 0.741394 | \n",
+ " 1.532100 | \n",
+ " 195.812000 | \n",
+ "
\n",
+ " \n",
+ " 14900 | \n",
+ " 1.000100 | \n",
+ " 0.740445 | \n",
+ " 1.604900 | \n",
+ " 186.927000 | \n",
+ "
\n",
+ " \n",
+ " 15000 | \n",
+ " 1.020800 | \n",
+ " 0.740295 | \n",
+ " 1.490500 | \n",
+ " 201.274000 | \n",
+ "
\n",
+ " \n",
+ " 15100 | \n",
+ " 1.000800 | \n",
+ " 0.740918 | \n",
+ " 1.462000 | \n",
+ " 205.205000 | \n",
+ "
\n",
+ " \n",
+ " 15200 | \n",
+ " 0.975700 | \n",
+ " 0.740664 | \n",
+ " 1.277300 | \n",
+ " 234.879000 | \n",
+ "
\n",
+ " \n",
+ " 15300 | \n",
+ " 1.013500 | \n",
+ " 0.739949 | \n",
+ " 1.284700 | \n",
+ " 233.514000 | \n",
+ "
\n",
+ " \n",
+ " 15400 | \n",
+ " 1.007800 | \n",
+ " 0.739860 | \n",
+ " 1.244100 | \n",
+ " 241.136000 | \n",
+ "
\n",
+ " \n",
+ " 15500 | \n",
+ " 1.024600 | \n",
+ " 0.740061 | \n",
+ " 1.859200 | \n",
+ " 161.360000 | \n",
+ "
\n",
+ " \n",
+ " 15600 | \n",
+ " 1.015200 | \n",
+ " 0.740142 | \n",
+ " 1.471400 | \n",
+ " 203.889000 | \n",
+ "
\n",
+ " \n",
+ " 15700 | \n",
+ " 1.006800 | \n",
+ " 0.740317 | \n",
+ " 1.447000 | \n",
+ " 207.322000 | \n",
+ "
\n",
+ " \n",
+ " 15800 | \n",
+ " 1.007800 | \n",
+ " 0.740259 | \n",
+ " 1.432000 | \n",
+ " 209.492000 | \n",
+ "
\n",
+ " \n",
+ " 15900 | \n",
+ " 1.004500 | \n",
+ " 0.740475 | \n",
+ " 1.448100 | \n",
+ " 207.164000 | \n",
+ "
\n",
+ " \n",
+ " 16000 | \n",
+ " 0.971900 | \n",
+ " 0.740364 | \n",
+ " 1.352500 | \n",
+ " 221.805000 | \n",
+ "
\n",
+ " \n",
+ " 16100 | \n",
+ " 0.979900 | \n",
+ " 0.740424 | \n",
+ " 1.278300 | \n",
+ " 234.695000 | \n",
+ "
\n",
+ " \n",
+ " 16200 | \n",
+ " 0.990300 | \n",
+ " 0.740286 | \n",
+ " 1.528500 | \n",
+ " 196.277000 | \n",
+ "
\n",
+ " \n",
+ " 16300 | \n",
+ " 0.992700 | \n",
+ " 0.740709 | \n",
+ " 1.274100 | \n",
+ " 235.463000 | \n",
+ "
\n",
+ " \n",
+ " 16400 | \n",
+ " 1.000600 | \n",
+ " 0.740067 | \n",
+ " 1.264900 | \n",
+ " 237.169000 | \n",
+ "
\n",
+ " \n",
+ " 16500 | \n",
+ " 1.004000 | \n",
+ " 0.740554 | \n",
+ " 1.298000 | \n",
+ " 231.122000 | \n",
+ "
\n",
+ " \n",
+ " 16600 | \n",
+ " 0.987400 | \n",
+ " 0.739771 | \n",
+ " 1.275400 | \n",
+ " 235.219000 | \n",
+ "
\n",
+ " \n",
+ " 16700 | \n",
+ " 0.965700 | \n",
+ " 0.739481 | \n",
+ " 1.562800 | \n",
+ " 191.959000 | \n",
+ "
\n",
+ " \n",
+ " 16800 | \n",
+ " 0.999300 | \n",
+ " 0.739890 | \n",
+ " 1.303900 | \n",
+ " 230.078000 | \n",
+ "
\n",
+ " \n",
+ " 16900 | \n",
+ " 1.014200 | \n",
+ " 0.739523 | \n",
+ " 1.291700 | \n",
+ " 232.250000 | \n",
+ "
\n",
+ " \n",
+ " 17000 | \n",
+ " 0.981600 | \n",
+ " 0.740319 | \n",
+ " 1.315600 | \n",
+ " 228.028000 | \n",
+ "
\n",
+ " \n",
+ " 17100 | \n",
+ " 0.990400 | \n",
+ " 0.739653 | \n",
+ " 1.604000 | \n",
+ " 187.034000 | \n",
+ "
\n",
+ " \n",
+ " 17200 | \n",
+ " 0.990600 | \n",
+ " 0.739893 | \n",
+ " 1.340100 | \n",
+ " 223.861000 | \n",
+ "
\n",
+ " \n",
+ " 17300 | \n",
+ " 1.001500 | \n",
+ " 0.739922 | \n",
+ " 1.258800 | \n",
+ " 238.324000 | \n",
+ "
\n",
+ " \n",
+ " 17400 | \n",
+ " 1.016000 | \n",
+ " 0.739180 | \n",
+ " 1.268600 | \n",
+ " 236.475000 | \n",
+ "
\n",
+ " \n",
+ " 17500 | \n",
+ " 1.013600 | \n",
+ " 0.739608 | \n",
+ " 1.248700 | \n",
+ " 240.248000 | \n",
+ "
\n",
+ " \n",
+ " 17600 | \n",
+ " 0.984000 | \n",
+ " 0.739738 | \n",
+ " 1.251400 | \n",
+ " 239.724000 | \n",
+ "
\n",
+ " \n",
+ " 17700 | \n",
+ " 0.976100 | \n",
+ " 0.739444 | \n",
+ " 1.252400 | \n",
+ " 239.538000 | \n",
+ "
\n",
+ " \n",
+ " 17800 | \n",
+ " 0.982300 | \n",
+ " 0.738769 | \n",
+ " 1.249200 | \n",
+ " 240.153000 | \n",
+ "
\n",
+ " \n",
+ " 17900 | \n",
+ " 0.989300 | \n",
+ " 0.738793 | \n",
+ " 1.551400 | \n",
+ " 193.368000 | \n",
+ "
\n",
+ " \n",
+ " 18000 | \n",
+ " 1.012300 | \n",
+ " 0.738799 | \n",
+ " 1.537700 | \n",
+ " 195.094000 | \n",
+ "
\n",
+ " \n",
+ " 18100 | \n",
+ " 1.038100 | \n",
+ " 0.739148 | \n",
+ " 1.251200 | \n",
+ " 239.778000 | \n",
+ "
\n",
+ " \n",
+ " 18200 | \n",
+ " 1.010000 | \n",
+ " 0.739396 | \n",
+ " 1.263800 | \n",
+ " 237.373000 | \n",
+ "
\n",
+ " \n",
+ " 18300 | \n",
+ " 0.993900 | \n",
+ " 0.740252 | \n",
+ " 1.249700 | \n",
+ " 240.052000 | \n",
+ "
\n",
+ " \n",
+ " 18400 | \n",
+ " 0.978400 | \n",
+ " 0.739943 | \n",
+ " 1.265800 | \n",
+ " 237.003000 | \n",
+ "
\n",
+ " \n",
+ " 18500 | \n",
+ " 0.998600 | \n",
+ " 0.739694 | \n",
+ " 1.267100 | \n",
+ " 236.755000 | \n",
+ "
\n",
+ " \n",
+ " 18600 | \n",
+ " 1.012900 | \n",
+ " 0.738725 | \n",
+ " 1.342900 | \n",
+ " 223.400000 | \n",
+ "
\n",
+ " \n",
+ " 18700 | \n",
+ " 0.991900 | \n",
+ " 0.738619 | \n",
+ " 1.290600 | \n",
+ " 232.456000 | \n",
+ "
\n",
+ " \n",
+ " 18800 | \n",
+ " 1.018000 | \n",
+ " 0.738426 | \n",
+ " 1.515000 | \n",
+ " 198.026000 | \n",
+ "
\n",
+ " \n",
+ " 18900 | \n",
+ " 0.982600 | \n",
+ " 0.738811 | \n",
+ " 1.494000 | \n",
+ " 200.808000 | \n",
+ "
\n",
+ " \n",
+ " 19000 | \n",
+ " 0.954200 | \n",
+ " 0.738561 | \n",
+ " 1.508100 | \n",
+ " 198.922000 | \n",
+ "
\n",
+ " \n",
+ " 19100 | \n",
+ " 1.018100 | \n",
+ " 0.739036 | \n",
+ " 1.306000 | \n",
+ " 229.707000 | \n",
+ "
\n",
+ " \n",
+ " 19200 | \n",
+ " 1.007800 | \n",
+ " 0.738729 | \n",
+ " 1.310700 | \n",
+ " 228.877000 | \n",
+ "
\n",
+ " \n",
+ " 19300 | \n",
+ " 0.988500 | \n",
+ " 0.738832 | \n",
+ " 1.288000 | \n",
+ " 232.916000 | \n",
+ "
\n",
+ " \n",
+ " 19400 | \n",
+ " 0.989300 | \n",
+ " 0.738646 | \n",
+ " 1.293900 | \n",
+ " 231.858000 | \n",
+ "
\n",
+ " \n",
+ " 19500 | \n",
+ " 0.989900 | \n",
+ " 0.738120 | \n",
+ " 1.289200 | \n",
+ " 232.704000 | \n",
+ "
\n",
+ " \n",
+ " 19600 | \n",
+ " 1.004500 | \n",
+ " 0.738041 | \n",
+ " 1.296600 | \n",
+ " 231.383000 | \n",
+ "
\n",
+ " \n",
+ " 19700 | \n",
+ " 0.973500 | \n",
+ " 0.738274 | \n",
+ " 1.293300 | \n",
+ " 231.959000 | \n",
+ "
\n",
+ " \n",
+ " 19800 | \n",
+ " 1.021500 | \n",
+ " 0.738264 | \n",
+ " 1.288700 | \n",
+ " 232.792000 | \n",
+ "
\n",
+ " \n",
+ " 19900 | \n",
+ " 1.020100 | \n",
+ " 0.738159 | \n",
+ " 1.287900 | \n",
+ " 232.932000 | \n",
+ "
\n",
+ " \n",
+ " 20000 | \n",
+ " 0.988100 | \n",
+ " 0.738830 | \n",
+ " 1.288400 | \n",
+ " 232.845000 | \n",
+ "
\n",
+ " \n",
+ " 20100 | \n",
+ " 0.988900 | \n",
+ " 0.738999 | \n",
+ " 1.351100 | \n",
+ " 222.044000 | \n",
+ "
\n",
+ " \n",
+ " 20200 | \n",
+ " 0.946700 | \n",
+ " 0.739130 | \n",
+ " 1.295800 | \n",
+ " 231.523000 | \n",
+ "
\n",
+ " \n",
+ " 20300 | \n",
+ " 0.985200 | \n",
+ " 0.738304 | \n",
+ " 1.295200 | \n",
+ " 231.627000 | \n",
+ "
\n",
+ " \n",
+ " 20400 | \n",
+ " 0.976200 | \n",
+ " 0.738922 | \n",
+ " 1.293200 | \n",
+ " 231.979000 | \n",
+ "
\n",
+ " \n",
+ " 20500 | \n",
+ " 0.972600 | \n",
+ " 0.738379 | \n",
+ " 1.292200 | \n",
+ " 232.153000 | \n",
+ "
\n",
+ " \n",
+ " 20600 | \n",
+ " 0.977300 | \n",
+ " 0.738292 | \n",
+ " 1.293500 | \n",
+ " 231.922000 | \n",
+ "
\n",
+ " \n",
+ " 20700 | \n",
+ " 1.014100 | \n",
+ " 0.738517 | \n",
+ " 1.293500 | \n",
+ " 231.923000 | \n",
+ "
\n",
+ " \n",
+ " 20800 | \n",
+ " 0.986900 | \n",
+ " 0.738709 | \n",
+ " 1.293300 | \n",
+ " 231.972000 | \n",
+ "
\n",
+ " \n",
+ " 20900 | \n",
+ " 1.004500 | \n",
+ " 0.737854 | \n",
+ " 1.288800 | \n",
+ " 232.769000 | \n",
+ "
\n",
+ " \n",
+ " 21000 | \n",
+ " 0.996200 | \n",
+ " 0.738131 | \n",
+ " 1.281100 | \n",
+ " 234.165000 | \n",
+ "
\n",
+ " \n",
+ " 21100 | \n",
+ " 0.994800 | \n",
+ " 0.737721 | \n",
+ " 1.279400 | \n",
+ " 234.477000 | \n",
+ "
\n",
+ " \n",
+ " 21200 | \n",
+ " 0.990400 | \n",
+ " 0.738066 | \n",
+ " 1.286100 | \n",
+ " 233.268000 | \n",
+ "
\n",
+ " \n",
+ " 21300 | \n",
+ " 1.001100 | \n",
+ " 0.738174 | \n",
+ " 1.287700 | \n",
+ " 232.973000 | \n",
+ "
\n",
+ " \n",
+ "
"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ " [38/38 00:01]\n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'eval_loss': 0.7315998077392578,\n",
+ " 'eval_runtime': 1.383,\n",
+ " 'eval_samples_per_second': 216.921,\n",
+ " 'epoch': 3.0}"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "trainer.evaluate()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.eval();"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "the Trump administration's internal policy is nonsense.\n",
+ "the internal policy of Donald Trump is nonsense.\n",
+ "the internal policy of President Trump is nonsense.\n",
+ "the Trump administration's internal policy is crazy.\n",
+ "the president's internal policy is nonsense.\n",
+ "the internal policy of Mr. Trump is nonsense.\n",
+ "the Trump administration's internal policy is stupid.\n",
+ "the internal policy of Trump is nonsense.\n",
+ "the Trump administration's internal policy is bad.\n",
+ "the Trump internal policy is nonsense.\n"
+ ]
+ }
+ ],
+ "source": [
+ "inputs = tokenizer('The internal policy of the fucking Trump is stupid.', return_tensors='pt')\n",
+ "inputs = {k: v.to(device) for k, v in inputs.items()}\n",
+ "for t in model.generate(**inputs, num_return_sequences=10, do_sample=False, num_beams=10):\n",
+ " print(tokenizer.decode(t, skip_special_tokens=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save_pretrained(save_name)"
+ ]
+ },
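+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Added sketch: also save the tokenizer so the checkpoint directory can be\n",
+ "# loaded standalone with from_pretrained later.\n",
+ "tokenizer.save_pretrained(save_name)"
+ ]
+ },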
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/emnlp2021/style_transfer/mining_parallel_corpus/paranmt_mining.ipynb b/emnlp2021/style_transfer/mining_parallel_corpus/paranmt_mining.ipynb
new file mode 100644
index 0000000..409827a
--- /dev/null
+++ b/emnlp2021/style_transfer/mining_parallel_corpus/paranmt_mining.ipynb
@@ -0,0 +1,1153 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before starting the mining process, please download the corpus from https://www.cs.cmu.edu/~jwieting/ "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fn = '/home/dale/data/paraphrase_corpora/para-nmt-50m/para-nmt-50m.txt'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import csv\n",
+ "pd.options.display.max_colwidth = 150"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '4'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv(fn, sep='\\t', header=None, nrows=1_000_000, encoding='utf-8', quoting=csv.QUOTE_NONE).dropna()\n",
+ "df.columns = ['reference', 'translation', 'similarity']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def lenth_diff(row):\n",
+ " l1 = len(row.reference)\n",
+ " l2 = len(row.translation)\n",
+ " return np.abs(l1-l2) / (max(l1, l2) + 1)\n",
+ "\n",
+ "df['lenght_diff'] = df.apply(lenth_diff, axis=1)"
+ ]
+ },
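+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Worked example (added sketch): abs(12 - 3) / (12 + 1) ~= 0.69; the +1 in\n",
+ "# the denominator guards against empty strings.\n",
+ "lenth_diff(pd.Series({'reference': 'Hello there!', 'translation': 'Hi!'}))"
+ ]
+ },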
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.7133114266228533\n",
+ "0.8754667509335019\n",
+ "0.8481736963473927\n",
+ "0.5371920743841487\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(np.mean((df.similarity > 0.6)))\n",
+ "print(np.mean(df.similarity <= 0.95))\n",
+ "print(np.mean(df.lenght_diff <= 0.4))\n",
+ "print(np.mean((df.similarity > 0.6) & (df.similarity <= 0.95) & (df.lenght_diff <= 0.4)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1000, 4)"
+ ]
+ },
+ "execution_count": 88,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nonsim = df[(df.similarity > 0.6) & (df.similarity <= 0.95) & (df.lenght_diff <= 0.4)].head(1000)\n",
+ "nonsim.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tqdm.auto import tqdm, trange\n",
+ "from transformers import RobertaTokenizer, RobertaForSequenceClassification\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_name = 'SkolkovoInstitute/roberta_toxicity_classifier_v1'\n",
+ "\n",
+ "tokenizer = RobertaTokenizer.from_pretrained(model_name)\n",
+ "model = RobertaForSequenceClassification.from_pretrained(model_name)\n",
+ "\n",
+ "\n",
+ "def classify_preds(preds, batch_size=32, soft=True, threshold=0.5, soft=False):\n",
+ " single = False\n",
+ " if isinstance(preds, str):\n",
+ " preds = [preds]\n",
+ " single = True\n",
+ " results = []\n",
+ " \n",
+ " f = trange if verbose else range\n",
+ "\n",
+ " for i in f(0, len(preds), batch_size)):\n",
+ " batch = tokenizer(preds[i:i + batch_size], return_tensors='pt', padding=True)\n",
+ " with torch.inference_mode():\n",
+ " logits = model(**batch).logits\n",
+ " if soft:\n",
+ " result = torch.softmax(logits, -1)[:, 1].cpu().numpy()\n",
+ " else:\n",
+ " result = (logits[:, 1] > threshold).cpu().numpy()\n",
+ " results.extend([1 - item for item in result])\n",
+ " if single:\n",
+ " return np.mean(results)\n",
+ " return results"
+ ]
+ },
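+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Smoke test (added sketch): a single string goes through the isinstance\n",
+ "# branch and comes back as one score.\n",
+ "classify_preds('you are a genius', soft=True)"
+ ]
+ },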
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "febffe55f3794fb484861b9a9b56e66c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/dale/p3/lib/python3.7/site-packages/pandas/core/frame.py:1490: FutureWarning: Using short name for 'orient' is deprecated. Only the options: ('dict', list, 'series', 'split', 'records', 'index') will be used in a future version. Use one of the above to silence this warning.\n",
+ " FutureWarning,\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/dale/p3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \"\"\"Entry point for launching an IPython kernel.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d64463b92f054f37a19e07063796e5da",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=0.0, max=52.0), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/dale/p3/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " \n"
+ ]
+ }
+ ],
+ "source": [
+ "nonsim['ref_tox'] = classify_preds(nonsim.reference.tolist(), verbose=True, batch_size=64)\n",
+ "nonsim['trn_tox'] = classify_preds(nonsim.translation.tolist(), verbose=True, batch_size=64)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPZElEQVR4nO3df6zd9V3H8ed7XAHhspbR2ZC27rKsmzZtVLiBLiTz3nUxpTOUREZY2ChLtdlkiDITqvtjRmOEPxgBQqaNLCum7sLqYhsG6izckC222g6kUJy7sMJaazugVMsPB/HtH+cD1nrbc3rvOefL/dznI7np98fne77v9z2X1/3ez/meQ2QmkqS6vKvpAiRJ3We4S1KFDHdJqpDhLkkVMtwlqUIDTRcAMG/evBwaGprSsa+88gpnn312dwt6h7Pn2cGeZ4fp9Lxr164XMvO9k+17R4T70NAQO3funNKx4+PjjIyMdLegdzh7nh3seXaYTs8R8dyJ9jktI0kVMtwlqUKGuyRVyHCXpAoZ7pJUIcNdkipkuEtShQx3SaqQ4S5JFXpHvEN1OnbvP8J167/VyLn33vLxRs4rSe145S5JFTLcJalChrskVchwl6QKGe6SVCHDXZIqZLhLUoUMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVyHCXpAoZ7pJUIcNdkipkuEtShQx3SaqQ4S5JFTLcJalChrskVaijcI+I34mIpyLiyYj4ekScGREXRMSOiJiIiPsi4vQy9oyyPlH2D/W0A0nS/9M23CNiAfBbwHBmLgVOA64GbgVuz8wPAIeBteWQtcDhsv32Mk6S1EedTssMAD8dEQPAWcAB4KPA5rJ/I3BFWV5d1in7V0REdKVaSVJHIjPbD4q4Efhj4DXg74Abge3l6pyIWAQ8lJlLI+JJYGVm7iv7ngEuycwXjnvMdcA6gPnz5180NjY2pQYOvXSEg69N6dBpW7ZgTiPnPXr0KIODg42cuyn2PDvY86kZHR3dlZnDk+0baHdwRJxL62r8AuBl4BvAyilVcozM3ABsABgeHs6RkZEpPc5dm7Zw2+62bfTE3mtGGjnv+Pg4U/1+zVT2PDvYc/d0Mi3zMeCHmfnjzHwD+CZwKTC3TNMALAT2l+X9wCKAsn8O8GJXq5YknVQn4f48sDwizipz5yuAPcAjwJVlzBpgS1neWtYp+x/OTuZ+JEld0zbcM3MHrRdGvwfsLsdsAG4GboqICeA84J5yyD3AeWX7TcD6HtQtSTqJjiarM/NLwJeO2/wscPEkY18HPjH90iRJU+U7VCWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVyHCXpAoZ7pJUIcNdkipkuEtShQx3SaqQ4S5JFTLcJalChrskVchwl6QKGe6SVCHDXZIqZLhLUoUMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVyHCXpAp1FO4RMTciNkfEv0TE0xHx4Yh4T0R8OyJ+UP49t4yNiLgzIiYi4omIuLC3LUiSjtfplfsdwN9k5s8BvwA8DawHtmXmYmBbWQe4DFhcvtYBX+lqxZKkttqGe0TMAT4C3AOQmT/JzJeB1cDGMmwjcEVZXg3cmy3bgbkRcX6X65YknURk5skHRPwisAHYQ+uqfRdwI7A/M+eWMQEczsy5EfEAcEtmfqfs2wbcnJk7j3vcdbSu7Jk/f/5FY2NjU2rg0EtHOPjalA6dtmUL5jRy3qNHjzI4ONjIuZtiz7ODPZ+a0dHRXZk5PNm+gQ6OHwAuBG7IzB0RcQf/OwUDQGZmRJz8t8RxMnMDrV8aDA8P58jIyKkc/ra7Nm3htt2dtNF9e68ZaeS84+PjTPX7NVPZ8+xgz93TyZz7PmBfZu4o65tphf3Bt6Zbyr+Hyv79wKJjjl9YtkmS+qRtuGfmvwM/iogPlU0raE3RbAXWlG1rgC1leStwbblrZjlwJDMPdLdsSdLJdDqfcQOwKSJOB54FPkPrF8P9EbEWeA64qox9EFgFTACvlrGSpD7qKNwz83Fgskn7FZOMTeD66ZUlSZoO36EqSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVyHCXpAoZ7pJUIcNdkipkuEtShQx3SaqQ4S5JFTLcJalChrskVchwl6QKGe6SVCHDXZIqZLhLUoUMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVqONwj4jTIuKxiHigrF8QETsiYiIi7ouI08v2M8r6RNk/1KPaJUkncCpX7jcCTx+zfitwe2Z+ADgMrC3b1wKHy/bbyzhJUh91FO4RsRD4OPDnZT2AjwKby5CNwBVleXVZp+xfUcZLkvokMrP9oIjNwJ8A5wC/C1wHbC9X50TEIuChzFwaEU8CKzNzX9n3DHBJZr5w3GOuA9YBzJ8//6KxsbEpNXDopSMcfG1Kh07bsgVzGjnv0aNHGRwcbOTcTbHn2cGeT83o6OiuzByebN9Au4Mj4leBQ5m5KyJGplTBJDJzA7ABYHh4OEdGpvbQd23awm2727bRE3uvGWnkvOPj40z1+zVT2fPsYM/d00kqXgpcHhGrgDOBdwN3AHMjYiAz3wQWAvvL+P3AImBfRAwAc4AXu165JOmE2s65Z+bvZebCzBwCrgYezsxrgEeAK8uwNcCWsry1rFP2P5ydzP1IkrpmOve53wzcFBETwHnAPWX7PcB5ZftNwPrplShJOlWnNFmdmePAeFl+Frh4kjGvA5/oQm2SpCnyHaqSVCHDXZIqZLhLUoUMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVyHCXpAoZ7pJUIcNdkipkuEtShQx3SaqQ4S5JFTLcJalChrskVchwl6QKGe6SVCHDXZIqZLhLUoUMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KF2oZ7RCyKiEciYk9EPBURN5bt74mIb0fED8q/55btERF3RsRERDwRERf2uglJ0v/VyZX7m8AXMnMJsBy4PiKWAOuBbZm5GNhW1gEuAxaXr3XAV7petSTppNqGe2YeyMzvleX/BJ4GFgCrgY1l2EbgirK8Grg3W7YDcyPi/G4XLkk6scjMzgdHDAGPAkuB5zNzbtkewOHMnBsRDwC3ZOZ3yr5twM2ZufO4x1pH68qe+fPnXzQ2NjalBg69dISDr03p0GlbtmBOI+c9evQog4ODjZy7KfY8O9jzqRkdHd2VmcOT7Rvo9EEiYhD4K+C3M/M/WnnekpkZEZ3/lmgdswHYADA8PJwjIyOncvjb7tq0hdt2d9xGV+29ZqSR846PjzPV79dMZc+zgz13T0d3y0TET9EK9k2Z+c2
y+eBb0y3l30Nl+35g0TGHLyzbJEl90sndMgHcAzydmV8+ZtdWYE1ZXgNsOWb7teWumeXAkcw80MWaJUltdDKfcSnwaWB3RDxetv0+cAtwf0SsBZ4Drir7HgRWARPAq8BnulmwJKm9tuFeXhiNE+xeMcn4BK6fZl2SpGnwHaqSVCHDXZIqZLhLUoUMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklQhw12SKmS4S1KFDHdJqpDhLkkVMtwlqUKGuyRVyHCXpAoZ7pJUIcNdkipkuEtShQx3SaqQ4S5JFTLcJalChrskVchwl6QKGe6SVCHDXZIqZLhLUoUMd0mqkOEuSRUaaLoASWra0PpvNXbur608uyeP65W7JFXIcJekChnuklQhw12SKmS4S1KFehLuEbEyIr4fERMRsb4X55AknVjXwz0iTgPuBi4DlgCfjIgl3T6PJOnEenGf+8XARGY+CxARY8BqYE8PztWopu6N7dV9sZ1oqucvLHuT6xq8F7kJTT3PTd7zPRuf516JzOzuA0ZcCazMzF8v658GLsnMzx83bh2wrqx+CPj+FE85D3hhisfOVPY8O9jz7DCdnt+Xme+dbEdj71DNzA3Ahuk+TkTszMzhLpQ0Y9jz7GDPs0Oveu7FC6r7gUXHrC8s2yRJfdKLcP8nYHFEXBARpwNXA1t7cB5J0gl0fVomM9+MiM8DfwucBnw1M5/q9nmOMe2pnRnInmcHe54detJz119QlSQ1z3eoSlKFDHdJqtCMCfd2H2kQEWdExH1l/46IGGqgzK7qoOebImJPRDwREdsi4n1N1NlNnX50RUT8WkRkRMz42+Y66TkirirP9VMR8Zf9rrHbOvjZ/tmIeCQiHis/36uaqLNbIuKrEXEoIp48wf6IiDvL9+OJiLhw2ifNzHf8F60XZp8B3g+cDvwzsOS4Mb8J/GlZvhq4r+m6+9DzKHBWWf7cbOi5jDsHeBTYDgw3XXcfnufFwGPAuWX9Z5quuw89bwA+V5aXAHubrnuaPX8EuBB48gT7VwEPAQEsB3ZM95wz5cr97Y80yMyfAG99pMGxVgMby/JmYEVERB9r7La2PWfmI5n5alndTus9BTNZJ88zwB8BtwKv97O4Humk598A7s7MwwCZeajPNXZbJz0n8O6yPAf4tz7W13WZ+Sjw0kmGrAbuzZbtwNyIOH8655wp4b4A+NEx6/vKtknHZOabwBHgvL5U1xud9HystbR+889kbXsuf64uysxaPoCkk+f5g8AHI+K7EbE9Ilb2rbre6KTnPwA+FRH7gAeBG/pTWmNO9b/3tvwfZFcgIj4FDAO/3HQtvRQR7wK+DFzXcCn9NkBramaE1l9nj0bEssx8ucmieuyTwNcy87aI+DDwFxGxNDP/u+nCZoqZcuXeyUcavD0mIgZo/Sn3Yl+q642OPsYhIj4GfBG4PDP/q0+19Uq7ns8BlgLjEbGX1tzk1hn+omonz/M+YGtmvpGZPwT+lVbYz1Sd9LwWuB8gM/8BOJPWB2zVqusf2zJTwr2TjzTYCqwpy1cCD2d5pWKGattzRPwS8Ge0gn2mz8NCm54z80hmzsvMocwcovU6w+WZubOZcruik5/tv6Z11U5EzKM1TfNsH2vstk56fh5YARARP08r3H/c1yr7aytwbblrZjlwJDMPTOsRm34V+RRebV5F64rlGeCLZdsf0vqPG1pP/jeACeAfgfc3XXMfev574CDwePna2nTNve75uLHjzPC7ZTp8noPWdNQeYDdwddM196HnJcB3ad1J8zjwK03XPM1+vw4cAN6g9ZfYWuCzwGePeY7vLt+P3d34ufbjBySpQjNlWkaSdAoMd0mqkOEuSRUy3CWpQoa7JFXIcJekChnuklSh/wFzrK5YyWChsAAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "nonsim.trn_tox.hist()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* We filter sentences with moderate similarity (60% to 95%) and similar length (no more then 40% difference) - 50%% of corpus\n",
+ "* Out of them, 2% have estimated toxicity is very different between the source and the translation (20 out of 1000)\n",
+ "* Out of these 18:\n",
+ " * 2 are false paraphrases\n",
+ " * 8 in fact have comparable toxicity\n",
+ " * 10 are valid detoxifications"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.02\n",
+ "20\n"
+ ]
+ }
+ ],
+ "source": [
+ "fltr = np.abs(nonsim.trn_tox-nonsim.ref_tox) > 0.5\n",
+ "print(np.mean(fltr))\n",
+ "print(np.sum(fltr))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.options.display.max_colwidth = 500"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " reference | \n",
+ " translation | \n",
+ " similarity | \n",
+ " lenght_diff | \n",
+ " ref_tox | \n",
+ " trn_tox | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters. | \n",
+ " if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter. | \n",
+ " 0.785171 | \n",
+ " 0.010309 | \n",
+ " 0.014195 | \n",
+ " 0.981983 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Now you're getting nasty. | \n",
+ " you're becoming disgusting. | \n",
+ " 0.749687 | \n",
+ " 0.071429 | \n",
+ " 0.065473 | \n",
+ " 0.999039 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Well, we could spare your life, for one. | \n",
+ " well, we can spare your life. | \n",
+ " 0.919051 | \n",
+ " 0.268293 | \n",
+ " 0.213313 | \n",
+ " 0.985068 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Ah! Monkey, you've got to snap out of it. | \n",
+ " monkey, you have to wake up. | \n",
+ " 0.664333 | \n",
+ " 0.309524 | \n",
+ " 0.053362 | \n",
+ " 0.994215 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " I've got orders to put her down. | \n",
+ " I have orders to kill her. | \n",
+ " 0.726639 | \n",
+ " 0.181818 | \n",
+ " 0.009402 | \n",
+ " 0.999348 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " I'm not gonna have a child... ...with the same genetic disorder as me who's gonna die. L... | \n",
+ " I'm not going to breed kids with a genetic disorder that makes them die. | \n",
+ " 0.703185 | \n",
+ " 0.206522 | \n",
+ " 0.950956 | \n",
+ " 0.035846 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " They're all laughing at us, so we'll kick your ass. | \n",
+ " they're laughing at us. We'll show you. | \n",
+ " 0.618866 | \n",
+ " 0.230769 | \n",
+ " 0.999492 | \n",
+ " 0.000131 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Maine was very short on black people back then. | \n",
+ " there wasn't much black in Maine then. | \n",
+ " 0.720482 | \n",
+ " 0.187500 | \n",
+ " 0.963680 | \n",
+ " 0.148710 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Briggs, what the hell's happening? | \n",
+ " Briggs, what the hell is going on? | \n",
+ " 0.920373 | \n",
+ " 0.000000 | \n",
+ " 0.159096 | \n",
+ " 0.841071 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Another one simply had no clue what to do, so whenever he met my brother he'd beat the crap out of him, and then say: | \n",
+ " another simply didn't know what to do, so whenever he met my brother, he nearly beat the shit out of him. | \n",
+ " 0.877540 | \n",
+ " 0.101695 | \n",
+ " 0.055371 | \n",
+ " 0.930472 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " I suppose you want me to buy you flowers and chocolates and whisper sweet nothings. | \n",
+ " you'd probably want me to buy you some chocolates and flowers... and whispered some pretty rubbish. | \n",
+ " 0.800661 | \n",
+ " 0.160000 | \n",
+ " 0.000078 | \n",
+ " 0.980341 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " So now their spirits are cursed, walking back roads, waterways, and if they find an unfaithful man, they kill him, and that man is never seen again. | \n",
+ " their souls are cursed, they guard the paths, he says, and when they encounter an unfaithful man, he will be killed, and his body will never be found. | \n",
+ " 0.755883 | \n",
+ " 0.013245 | \n",
+ " 0.842509 | \n",
+ " 0.143992 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Freezing him. | \n",
+ " I'll freeze him! | \n",
+ " 0.775646 | \n",
+ " 0.176471 | \n",
+ " 0.007186 | \n",
+ " 0.573710 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Come on, Cal, leave that shit alone. | \n",
+ " come on, Cal, put it down. | \n",
+ " 0.660481 | \n",
+ " 0.270270 | \n",
+ " 0.999637 | \n",
+ " 0.000279 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " So he's the Top dog. | \n",
+ " he's the tallest son of a bitch. | \n",
+ " 0.611092 | \n",
+ " 0.363636 | \n",
+ " 0.000920 | \n",
+ " 0.999639 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " I swore when I went out with Xander Harris... ...I'd rather die than datea fixer-upper again. | \n",
+ " when I was dating Alex Harris, I swore I'd rather die than go out with a loser. | \n",
+ " 0.790565 | \n",
+ " 0.148936 | \n",
+ " 0.011613 | \n",
+ " 0.996266 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " I'm famous, and you're done. | \n",
+ " I'm famous, and you're dead. | \n",
+ " 0.817253 | \n",
+ " 0.000000 | \n",
+ " 0.000926 | \n",
+ " 0.979738 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " To quote Jake Oppenheimer: I, who am about to die, must seem to them something \"God-awful.\"... | \n",
+ " to quote Jake and Oppenheimer: \"I must die, I must feel like a terrible god.\" | \n",
+ " 0.699590 | \n",
+ " 0.178947 | \n",
+ " 0.002348 | \n",
+ " 0.682655 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " “Could you please be quiet, Miss Lavish?” said Moist. | \n",
+ " 'could you keep your mouth shut, Miss Opulent? 'Said Moist. | \n",
+ " 0.808943 | \n",
+ " 0.100000 | \n",
+ " 0.000187 | \n",
+ " 0.760356 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Murder for hire. | \n",
+ " murder to order. | \n",
+ " 0.697667 | \n",
+ " 0.000000 | \n",
+ " 0.074589 | \n",
+ " 0.962326 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " reference \\\n",
+ "0 If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters. \n",
+ "1 Now you're getting nasty. \n",
+ "2 Well, we could spare your life, for one. \n",
+ "3 Ah! Monkey, you've got to snap out of it. \n",
+ "4 I've got orders to put her down. \n",
+ "5 I'm not gonna have a child... ...with the same genetic disorder as me who's gonna die. L... \n",
+ "6 They're all laughing at us, so we'll kick your ass. \n",
+ "7 Maine was very short on black people back then. \n",
+ "8 Briggs, what the hell's happening? \n",
+ "9 Another one simply had no clue what to do, so whenever he met my brother he'd beat the crap out of him, and then say: \n",
+ "10 I suppose you want me to buy you flowers and chocolates and whisper sweet nothings. \n",
+ "11 So now their spirits are cursed, walking back roads, waterways, and if they find an unfaithful man, they kill him, and that man is never seen again. \n",
+ "12 Freezing him. \n",
+ "13 Come on, Cal, leave that shit alone. \n",
+ "14 So he's the Top dog. \n",
+ "15 I swore when I went out with Xander Harris... ...I'd rather die than datea fixer-upper again. \n",
+ "16 I'm famous, and you're done. \n",
+ "17 To quote Jake Oppenheimer: I, who am about to die, must seem to them something \"God-awful.\"... \n",
+ "18 “Could you please be quiet, Miss Lavish?” said Moist. \n",
+ "19 Murder for hire. \n",
+ "\n",
+ " translation \\\n",
+ "0 if Alkar floods her with her mental waste, it would explain the high levels of neurotransmitter. \n",
+ "1 you're becoming disgusting. \n",
+ "2 well, we can spare your life. \n",
+ "3 monkey, you have to wake up. \n",
+ "4 I have orders to kill her. \n",
+ "5 I'm not going to breed kids with a genetic disorder that makes them die. \n",
+ "6 they're laughing at us. We'll show you. \n",
+ "7 there wasn't much black in Maine then. \n",
+ "8 Briggs, what the hell is going on? \n",
+ "9 another simply didn't know what to do, so whenever he met my brother, he nearly beat the shit out of him. \n",
+ "10 you'd probably want me to buy you some chocolates and flowers... and whispered some pretty rubbish. \n",
+ "11 their souls are cursed, they guard the paths, he says, and when they encounter an unfaithful man, he will be killed, and his body will never be found. \n",
+ "12 I'll freeze him! \n",
+ "13 come on, Cal, put it down. \n",
+ "14 he's the tallest son of a bitch. \n",
+ "15 when I was dating Alex Harris, I swore I'd rather die than go out with a loser. \n",
+ "16 I'm famous, and you're dead. \n",
+ "17 to quote Jake and Oppenheimer: \"I must die, I must feel like a terrible god.\" \n",
+ "18 'could you keep your mouth shut, Miss Opulent? 'Said Moist. \n",
+ "19 murder to order. \n",
+ "\n",
+ " similarity lenght_diff ref_tox trn_tox \n",
+ "0 0.785171 0.010309 0.014195 0.981983 \n",
+ "1 0.749687 0.071429 0.065473 0.999039 \n",
+ "2 0.919051 0.268293 0.213313 0.985068 \n",
+ "3 0.664333 0.309524 0.053362 0.994215 \n",
+ "4 0.726639 0.181818 0.009402 0.999348 \n",
+ "5 0.703185 0.206522 0.950956 0.035846 \n",
+ "6 0.618866 0.230769 0.999492 0.000131 \n",
+ "7 0.720482 0.187500 0.963680 0.148710 \n",
+ "8 0.920373 0.000000 0.159096 0.841071 \n",
+ "9 0.877540 0.101695 0.055371 0.930472 \n",
+ "10 0.800661 0.160000 0.000078 0.980341 \n",
+ "11 0.755883 0.013245 0.842509 0.143992 \n",
+ "12 0.775646 0.176471 0.007186 0.573710 \n",
+ "13 0.660481 0.270270 0.999637 0.000279 \n",
+ "14 0.611092 0.363636 0.000920 0.999639 \n",
+ "15 0.790565 0.148936 0.011613 0.996266 \n",
+ "16 0.817253 0.000000 0.000926 0.979738 \n",
+ "17 0.699590 0.178947 0.002348 0.682655 \n",
+ "18 0.808943 0.100000 0.000187 0.760356 \n",
+ "19 0.697667 0.000000 0.074589 0.962326 "
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nonsim[fltr].reset_index(drop=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Large scale fine tuning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chunksize = 3_000"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "import csv\n",
+    "\n",
+    "# fn is assumed to be the path to the mined paraphrase TSV, defined earlier in the notebook\n",
+    "df = pd.read_csv(fn, sep='\\t', header=None, nrows=1_000_000, encoding='utf-8', quoting=csv.QUOTE_NONE).dropna()\n",
+    "df.columns = ['reference', 'translation', 'similarity']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import warnings\n",
+ "warnings.simplefilter(action='ignore', category=FutureWarning)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+    "* 50M / 3K ≈ 16.7K iterations\n",
+    "* with 1 minute per iteration, the job will take about 11.6 days, but with free GPUs it is faster (a quick check of this arithmetic follows below)"
+ ]
+ },
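+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick sanity check of that estimate; the 50M row count and the 1 minute/iteration rate are assumptions from the list above, only `chunksize` comes from the earlier cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_rows = 50_000_000       # assumed corpus size\n",
+    "minutes_per_iter = 1      # assumed processing speed\n",
+    "n_iters = n_rows / chunksize\n",
+    "print(f'{n_iters:,.0f} iterations')  # ~16,667\n",
+    "print(f'{n_iters * minutes_per_iter / 60 / 24:.1f} days')  # ~11.6 days"
+   ]
+  },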
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "3803079445c04b6186b32195ac10fa9f",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "380 samples\n",
+ "689 samples\n",
+ "1025 samples\n",
+ "1353 samples\n",
+ "1663 samples\n",
+ "2012 samples\n",
+ "2353 samples\n",
+ "2696 samples\n",
+ "3033 samples\n",
+ "3372 samples\n",
+ "3709 samples\n",
+ "4040 samples\n",
+ "4385 samples\n",
+ "4702 samples\n",
+ "5046 samples\n",
+ "5405 samples\n",
+ "5742 samples\n",
+ "6046 samples\n",
+ "6374 samples\n",
+ "6698 samples\n",
+ "7042 samples\n",
+ "7386 samples\n",
+ "7718 samples\n",
+ "8015 samples\n",
+ "8374 samples\n",
+ "8679 samples\n",
+ "9031 samples\n",
+ "9371 samples\n",
+ "9735 samples\n",
+ "10066 samples\n",
+ "10385 samples\n",
+ "10728 samples\n",
+ "11046 samples\n",
+ "11395 samples\n",
+ "11748 samples\n",
+ "12081 samples\n",
+ "12414 samples\n"
+ ]
+ }
+ ],
+ "source": [
+ "results = []\n",
+ "\n",
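+    "# stream the corpus in chunks: filter by similarity and length, score toxicity,\n",
+    "# and keep pairs whose toxicity scores differ by more than 0.5\n",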
+ "for i, chunk in enumerate(tqdm(pd.read_csv(fn, sep='\\t', header=None, encoding='utf-8', quoting=csv.QUOTE_NONE, chunksize=chunksize))):\n",
+ " chunk.dropna(inplace=True)\n",
+ " chunk.columns = ['reference', 'translation', 'similarity']\n",
+ " chunk['lenght_diff'] = chunk.apply(lenth_diff, axis=1)\n",
+ " nonsim = chunk[(chunk.similarity > 0.6) & (chunk.similarity <= 0.95) & (chunk.lenght_diff <= 0.4)].copy()\n",
+ " \n",
+ " nonsim['ref_tox'] = classify_preds(nonsim.reference.tolist(), verbose=False, batch_size=64)\n",
+ " nonsim['trn_tox'] = classify_preds(nonsim.translation.tolist(), verbose=False, batch_size=64)\n",
+ " \n",
+ " fltr = np.abs(nonsim.trn_tox-nonsim.ref_tox) > 0.5\n",
+ " mined = nonsim[fltr]\n",
+ " results.append(mined)\n",
+ " # print(nonsim.shape[0], mined.shape[0])\n",
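+    "    # checkpoint every 10 chunks: dump everything mined so far to disk\n",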
+ " if i > 0 and i % 10 == 0:\n",
+ " res_df = pd.concat(results, ignore_index=True)\n",
+ " print(res_df.shape[0], 'samples')\n",
+ " res_df.to_csv('filtered.tsv', sep='\\t', encoding='utf-8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(577988, 6)"
+ ]
+ },
+ "execution_count": 129,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res_df = pd.concat(results, ignore_index=True)\n",
+ "res_df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " similarity | \n",
+ " lenght_diff | \n",
+ " ref_tox | \n",
+ " trn_tox | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 577988.000000 | \n",
+ " 577988.000000 | \n",
+ " 577988.000000 | \n",
+ " 577988.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 0.758466 | \n",
+ " 0.157654 | \n",
+ " 0.541382 | \n",
+ " 0.434479 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 0.092696 | \n",
+ " 0.108056 | \n",
+ " 0.457569 | \n",
+ " 0.458904 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 0.600001 | \n",
+ " 0.000000 | \n",
+ " 0.000033 | \n",
+ " 0.000033 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 0.681099 | \n",
+ " 0.066667 | \n",
+ " 0.012172 | \n",
+ " 0.000707 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 0.754436 | \n",
+ " 0.141818 | \n",
+ " 0.806834 | \n",
+ " 0.085110 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 0.831241 | \n",
+ " 0.238095 | \n",
+ " 0.990469 | \n",
+ " 0.973742 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 0.950000 | \n",
+ " 0.400000 | \n",
+ " 0.999724 | \n",
+ " 0.999730 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " similarity lenght_diff ref_tox trn_tox\n",
+ "count 577988.000000 577988.000000 577988.000000 577988.000000\n",
+ "mean 0.758466 0.157654 0.541382 0.434479\n",
+ "std 0.092696 0.108056 0.457569 0.458904\n",
+ "min 0.600001 0.000000 0.000033 0.000033\n",
+ "25% 0.681099 0.066667 0.012172 0.000707\n",
+ "50% 0.754436 0.141818 0.806834 0.085110\n",
+ "75% 0.831241 0.238095 0.990469 0.973742\n",
+ "max 0.950000 0.400000 0.999724 0.999730"
+ ]
+ },
+ "execution_count": 130,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res_df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " reference | \n",
+ " translation | \n",
+ " similarity | \n",
+ " lenght_diff | \n",
+ " ref_tox | \n",
+ " trn_tox | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 88294 | \n",
+ " I just beat up three guys by myself. | \n",
+ " I just cut three big men. | \n",
+ " 0.685685 | \n",
+ " 0.297297 | \n",
+ " 0.750938 | \n",
+ " 0.005162 | \n",
+ "
\n",
+ " \n",
+ " 402968 | \n",
+ " Whoo! Welcome back, mad dog! | \n",
+ " welcome back, Fury! | \n",
+ " 0.735353 | \n",
+ " 0.310345 | \n",
+ " 0.998412 | \n",
+ " 0.000081 | \n",
+ "
\n",
+ " \n",
+ " 396420 | \n",
+ " The roof is scraping my nut! | \n",
+ " the roof is cutting off my balls! | \n",
+ " 0.746109 | \n",
+ " 0.147059 | \n",
+ " 0.012407 | \n",
+ " 0.993682 | \n",
+ "
\n",
+ " \n",
+ " 344928 | \n",
+ " Blasted fairy music. | \n",
+ " to hell with the music. | \n",
+ " 0.667527 | \n",
+ " 0.125000 | \n",
+ " 0.001486 | \n",
+ " 0.932651 | \n",
+ "
\n",
+ " \n",
+ " 57239 | \n",
+ " A kidnapper and a child murderer, yes. | \n",
+ " the kidnapper and the child killer, yes. | \n",
+ " 0.888592 | \n",
+ " 0.048780 | \n",
+ " 0.967770 | \n",
+ " 0.121575 | \n",
+ "
\n",
+ " \n",
+ " 444506 | \n",
+ " For awhile, I went to sex clubs. | \n",
+ " I went to sex clubs for a while. | \n",
+ " 0.923540 | \n",
+ " 0.000000 | \n",
+ " 0.147675 | \n",
+ " 0.966582 | \n",
+ "
\n",
+ " \n",
+ " 22691 | \n",
+ " When I released her she bites you. | \n",
+ " if I leave her, she'll bite you. | \n",
+ " 0.675012 | \n",
+ " 0.057143 | \n",
+ " 0.006479 | \n",
+ " 0.979373 | \n",
+ "
\n",
+ " \n",
+ " 242403 | \n",
+ " If you don't at least try, Alex, you're already a loser. | \n",
+ " if you don't try, Alex, you lost in advance. | \n",
+ " 0.727835 | \n",
+ " 0.210526 | \n",
+ " 0.998164 | \n",
+ " 0.000176 | \n",
+ "
\n",
+ " \n",
+ " 179191 | \n",
+ " 'I know you're only the humorous thugs. | \n",
+ " \"I'm sure you're only a couple of comic thieves. | \n",
+ " 0.621704 | \n",
+ " 0.183673 | \n",
+ " 0.924046 | \n",
+ " 0.265508 | \n",
+ "
\n",
+ " \n",
+ " 541902 | \n",
+ " You know as well as I do what a monster he was. | \n",
+ " you know as much as I did for the monster. | \n",
+ " 0.700937 | \n",
+ " 0.104167 | \n",
+ " 0.995559 | \n",
+ " 0.002968 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " reference \\\n",
+ "88294 I just beat up three guys by myself. \n",
+ "402968 Whoo! Welcome back, mad dog! \n",
+ "396420 The roof is scraping my nut! \n",
+ "344928 Blasted fairy music. \n",
+ "57239 A kidnapper and a child murderer, yes. \n",
+ "444506 For awhile, I went to sex clubs. \n",
+ "22691 When I released her she bites you. \n",
+ "242403 If you don't at least try, Alex, you're already a loser. \n",
+ "179191 'I know you're only the humorous thugs. \n",
+ "541902 You know as well as I do what a monster he was. \n",
+ "\n",
+ " translation similarity \\\n",
+ "88294 I just cut three big men. 0.685685 \n",
+ "402968 welcome back, Fury! 0.735353 \n",
+ "396420 the roof is cutting off my balls! 0.746109 \n",
+ "344928 to hell with the music. 0.667527 \n",
+ "57239 the kidnapper and the child killer, yes. 0.888592 \n",
+ "444506 I went to sex clubs for a while. 0.923540 \n",
+ "22691 if I leave her, she'll bite you. 0.675012 \n",
+ "242403 if you don't try, Alex, you lost in advance. 0.727835 \n",
+ "179191 \"I'm sure you're only a couple of comic thieves. 0.621704 \n",
+ "541902 you know as much as I did for the monster. 0.700937 \n",
+ "\n",
+ " lenght_diff ref_tox trn_tox \n",
+ "88294 0.297297 0.750938 0.005162 \n",
+ "402968 0.310345 0.998412 0.000081 \n",
+ "396420 0.147059 0.012407 0.993682 \n",
+ "344928 0.125000 0.001486 0.932651 \n",
+ "57239 0.048780 0.967770 0.121575 \n",
+ "444506 0.000000 0.147675 0.966582 \n",
+ "22691 0.057143 0.006479 0.979373 \n",
+ "242403 0.210526 0.998164 0.000176 \n",
+ "179191 0.183673 0.924046 0.265508 \n",
+ "541902 0.104167 0.995559 0.002968 "
+ ]
+ },
+ "execution_count": 131,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res_df.sample(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 132,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAD4CAYAAAAZ1BptAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVWUlEQVR4nO3df7DldX3f8ecrixiHREExdxiWunTcNF2lQd0BOuk0tzKFC53JYooOzFQWQ9xMhTbpbDuu6R9Ylan+YZihVdK17LA4RqQklm1YSxnkjpNOQDAgsFDDDWrZDUpkAbM6ate++8f5bHO83s+9h/vj3L3c52PmzP2e9/fz+X4+Hy7c1/3+uIdUFZIkzeVnVnsCkqTjlyEhSeoyJCRJXYaEJKnLkJAkdZ2w2hNYbqeeempt2rRpUX2/973vcdJJJy3vhI5zrnl9cM3rw1LW/JWvfOU7VfX62fWXXUhs2rSJBx98cFF9p6enmZycXN4JHedc8/rgmteHpaw5yTfnqnu5SZLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1PWy+4trSVpNm3bduWpj3zy1/B9D4pmEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSuhYMiSQ/m+TLSb6a5ECSf9fqZya5P8lMks8lObHVX9nez7T9m4aO9YFW/1qSC4fqU602k2TXUH3OMSRJ4zHKmcQPgbdX1S8DZwNTSc4DPgZcX1VvBJ4HrmrtrwKeb/XrWzuSbAEuA94ETAGfTLIhyQbgE8BFwBbg8taWecaQJI3BgiFRA0fa21e0VwFvB25v9b3AJW17W3tP239+krT6rVX1w6r6OjADnNNeM1X1VFX9CLgV2Nb69MaQJI3BSP/Tofbb/leANzL4rf8vgBeq6mhrchA4vW2fDjwNUFVHk7wIvK7V7xs67HCfp2fVz219emPMnt8OYAfAxMQE09PToyzrpxw5cmTRfdcq17w+uObx2XnW0YUbrZCVWPNIIVFVPwbOTnIy8Hngl5Z1FktUVbuB3QBbt26tycnJRR1nenqaxfZdq1zz+uCax+fKVf4/0y33ml/S001V9QJwL/D3gZOTHAuZjcChtn0IOAOg7X8N8NxwfVafXv25ecaQJI3BKE83vb6dQZDkVcA/Bp5gEBaXtmbbgTva9r72nrb/i1VVrX5Ze/rpTGAz8GXgAWBze5LpRAY3t/e1Pr0xJEljMMrlptOAve2+xM8At1XVHyd5HLg1yUeAh4CbWvubgE8nmQEOM/ihT1UdSHIb8DhwFLi6XcYiyTXAXcAGYE9VHWjHen9nDEnSGCwYElX1CPCWOepPMXgyaXb9B8A7O8e6Drhujvp+YP+oY0iSxsO/uJYkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSepaMCSSnJHk3iSPJzmQ5Ldb/YNJDiV5uL0uHurzgSQzSb6W5MKh+lSrzSTZNVQ/M8n9rf65JCe2+ivb+5m2f9Oyrl6SNK9RziSOAjuragtwHnB1ki1t3/VVdXZ77Qdo+y4D3gRMAZ9MsiHJBuATwEXAFuDyoeN8rB3rjcDzwFWtfhXwfKtf39pJksZkwZCoqmeq6s/a9l8DTwCnz9NlG3BrVf2wqr4OzADntNdMVT1VVT8CbgW2JQnwduD21n8vcMnQsfa27duB81t7SdIYvKR7Eu1yz1uA+1vpmiSPJNmT5JRWOx14eqjbwVbr1V8HvFBVR2fVf+JYbf+Lrb0kaQxOGLVhkp8D/hD4nar6bpIbgQ8D1b5+HPiNFZnlwnPbAewAmJiYYHp6elHHOXLkyKL7rlWueX1wzeOz86yjCzdaISux5pFCIskrGATEZ6rqjwCq6ttD+z8F/HF7ewg4Y6j7xlajU38OODnJCe1sYbj9sWMdTHIC8JrW/idU1W5gN8DWrVtrcnJylGX9lOnpaRbbd61yzeuDax6fK3fdOfYxj7l56qRlX/MoTzcFuAl4oqp+b6h+2lCzdwCPte19wGXtyaQzgc3Al4EHgM3tSaYTGdzc3ldVBdwLXNr6bwfuGDrW9rZ9KfDF1l6SNAajnEn8CvBu4NEkD7fa7zJ4OulsBpebvgH8FkBVHUhyG/A4gyejrq6qHwMkuQa4C9gA7KmqA+147wduTfIR4CEGoUT7+ukkM8BhBsEiSRqTBUOiqv4EmOuJov3z9LkOuG6O+v65+lXVUwyefppd/wHwzoXmKElaGf7FtSSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqWjAkkpyR5N4kjyc5kOS3W/21Se5O8mT7ekqrJ8kNSWaSPJLkrUPH2t7aP5lk+1D9bUkebX1uSJL5xpAkjccoZxJHgZ1VtQU4D7g6yRZgF3BPVW0G7mnvAS4CNrfXDuBGGPzAB64FzgXOAa4d+qF/I/DeoX5Trd4bQ5I0BguGRFU9U1V/1rb/GngCOB3YBuxtzfYCl7TtbcAtNXAfcHKS04ALgbur6nBVPQ/cDUy1fa+uqvuqqoBbZh1rrjEkSWNwwktpnGQT8BbgfmCiqp5pu74FTLTt04Gnh7odbLX56gfnqDPPGLPntYPBWQsTExNMT0+/lGX9f0eOHFl037XKNa8Prnl8dp51dOxjHrMSax45JJL8HPCHwO9U1XfbbQMAqqqS1LLObJb5xqiq3cBugK1bt9bk5OSixpienmaxfdcq17w+uObxuXLXnWMf85ibp05a9jWP9HRTklcwCIjPVNUftfK326Ui2tdnW/0QcMZQ942tNl994xz1+caQJI3BKE83BbgJeKKqfm9o1z7g2BNK24E7hupXtKeczgNebJeM7gIuSHJKu2F9AXBX2/fdJOe1sa6Yday5xpAkjcEol5t+BXg38GiSh1vtd4GPArcluQr4JvCutm8/cDEwA3wfeA9AVR1O8mHggdbuQ1V1uG2/D7gZeBXwhfZinjEkSWOwYEhU1Z8A6ew+f472BVzdOdYeYM8c9QeBN89Rf26uMSRJ4+FfXEuSugwJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkrgVDIsmeJM8meWyo9sEkh5I83F4XD+37QJKZJF9LcuFQfarVZpLsGqqfmeT+Vv9ckhNb/ZXt/Uzbv2nZVi1JGskoZxI3A1Nz1K+vqrPbaz9Aki3AZcCbWp9PJtmQZAPwCeAiYAtweWsL8LF2rDcCzwNXtfp
VwPOtfn1rJ0kaowVDoqq+BBwe8XjbgFur6odV9XVgBjinvWaq6qmq+hFwK7AtSYC3A7e3/nuBS4aOtbdt3w6c39pLksbkhCX0vSbJFcCDwM6qeh44HbhvqM3BVgN4elb9XOB1wAtVdXSO9qcf61NVR5O82Np/Z/ZEkuwAdgBMTEwwPT29qAUdOXJk0X3XKte8Prjm8dl51tGFG62QlVjzYkPiRuDDQLWvHwd+Y7km9VJV1W5gN8DWrVtrcnJyUceZnp5msX3XKte8Prjm8bly151jH/OYm6dOWvY1L+rppqr6dlX9uKr+L/ApBpeTAA4BZww13dhqvfpzwMlJTphV/4ljtf2vae0lSWOyqJBIctrQ23cAx5582gdc1p5MOhPYDHwZeADY3J5kOpHBze19VVXAvcClrf924I6hY21v25cCX2ztJUljsuDlpiSfBSaBU5McBK4FJpOczeBy0zeA3wKoqgNJbgMeB44CV1fVj9txrgHuAjYAe6rqQBvi/cCtST4CPATc1Oo3AZ9OMsPgxvllS12sJOmlWTAkquryOco3zVE71v464Lo56vuB/XPUn+JvLlcN138AvHOh+UmSVo5/cS1J6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUpchIUnqMiQkSV2GhCSpy5CQJHUZEpKkLkNCktS1YEgk2ZPk2SSPDdVem+TuJE+2r6e0epLckGQmySNJ3jrUZ3tr/2SS7UP1tyV5tPW5IUnmG0OSND6jnEncDEzNqu0C7qmqzcA97T3ARcDm9toB3AiDH/jAtcC5wDnAtUM/9G8E3jvUb2qBMSRJY7JgSFTVl4DDs8rbgL1tey9wyVD9lhq4Dzg5yWnAhcDdVXW4qp4H7gam2r5XV9V9VVXALbOONdcYkqQxWew9iYmqeqZtfwuYaNunA08PtTvYavPVD85Rn28MSdKYnLDUA1RVJanlmMxix0iyg8HlLSYmJpienl7UOEeOHFl037XKNa8Prnl8dp51dOxjHrMSa15sSHw7yWlV9Uy7ZPRsqx8Czhhqt7HVDgGTs+rTrb5xjvbzjfFTqmo3sBtg69atNTk52Ws6r+npaRbbd61yzeuDax6fK3fdOfYxj7l56qRlX/NiLzftA449obQduGOofkV7yuk84MV2yegu4IIkp7Qb1hcAd7V9301yXnuq6YpZx5prDEnSmCx4JpHkswzOAk5NcpDBU0ofBW5LchXwTeBdrfl+4GJgBvg+8B6Aqjqc5MPAA63dh6rq2M3w9zF4gupVwBfai3nGkCSNyYIhUVWXd3adP0fbAq7uHGcPsGeO+oPAm+eoPzfXGJKk8fEvriVJXYaEJKnLkJAkdRkSkqQuQ0KS1GVISJK6DAlJUteSP7tJko5Hjx56cVU/IuPlwjMJSVKXISFJ6jIkJEldhoQkqcuQkCR1GRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXYaEJKnLkJAkdRkSkqQuQ0KS1LWkkEjyjSSPJnk4yYOt9tokdyd5sn09pdWT5IYkM0keSfLWoeNsb+2fTLJ9qP62dvyZ1jdLma8k6aVZjjOJf1RVZ1fV1vZ+F3BPVW0G7mnvAS4CNrfXDuBGGIQKcC1wLnAOcO2xYGlt3jvUb2oZ5itJGtFKXG7aBuxt23uBS4bqt9TAfcDJSU4DLgTurqrDVfU8cDcw1fa9uqruq6oCbhk6liRpDJb6vy8t4H8kKeA/VdVuYKKqnmn7vwVMtO3TgaeH+h5stfnqB+eo/5QkOxicnTAxMcH09PSiFnPkyJFF912rXPP6sB7XPPEq2HnW0dWexlitxPd5qSHxD6rqUJJfAO5O8r+Gd1ZVtQBZUS2cdgNs3bq1JicnF3Wc6elpFtt3rXLN68N6XPN/+MwdfPzRpf6IW1tunjpp2b/PS7rcVFWH2tdngc8zuKfw7XapiPb12db8EHDGUPeNrTZffeMcdUnSmCw6JJKclOTnj20DFwCPAfuAY08obQfuaNv7gCvaU07nAS+2y1J3ARckOaXdsL4AuKvt+26S89pTTVcMHUuSNAZLORebAD7fnko9AfiDqvrvSR4AbktyFfBN4F2t/X7gYmAG+D7wHoCqOpzkw8ADrd2Hqupw234fcDPwKuAL7SVJGpNFh0RVPQX88hz154Dz56gXcHXnWHuAPXPUHwTevNg5SpKWZn3d1ZE0Vpt23blqY+88a9WGflnxYzkkSV2GhCSpy5CQJHUZEpKkLkNCktRlSEiSugwJSVKXfychrQOPHnqRK1fxbxa0dnkmIUnqMiQkSV2GhCSpy3sS0pj4OUZaizyTkCR1eSahdccnfaTRGRJaFV56kdYGQ2LIav6G+Y2P/pNVGdffqiXNx5A4TqzWb9b+Vi1pPt64liR1GRKSpC5DQpLUZUhIkroMCUlS13EfEkmmknwtyUySXas9H0laT47rkEiyAfgEcBGwBbg8yZbVnZUkrR/HdUgA5wAzVfVUVf0IuBXYtspzkqR1I1W12nPoSnIpMFVVv9nevxs4t6qumdVuB7Cjvf07wNcWOeSpwHcW2Xetcs3rg2teH5ay5jdU1etnF18Wf3FdVbuB3Us9TpIHq2rrMkxpzXDN64NrXh9WYs3H++WmQ8AZQ+83tpokaQyO95B4ANic5MwkJwKXAftWeU6StG4c15ebqupokmuAu4ANwJ6qOrCCQy75ktUa5JrXB9e8Piz7mo/rG9eSpNV1vF9ukiStIkNCktS1LkNioY/6SHJlkr9K8nB7/eZqzHM5jfLxJkneleTxJAeS/MG457jcRvg+Xz/0Pf7zJC+swjSX1Qhr/ltJ7k3yUJJHkly8GvNcTiOs+Q1J7mnrnU6ycTXmuVyS7EnybJLHOvuT5Ib2z+ORJG9d0oBVta5eDG6A/wXwt4ETga8CW2a1uRL4j6s91zGveTPwEHBKe/8Lqz3vlV7zrPb/gsGDEas+9xX+Pu8G/nnb3gJ8Y7XnPYY1/xdge9t+O/Dp1Z73Etf8D4G3Ao919l8MfAEIcB5w/1LGW49nEuvxoz5GWfN7gU9U1fMAVfXsmOe43F7q9/ly4LNjmdnKGWXNBby6bb8G+Msxzm8ljLLmLcAX2/a9c+xfU6rqS8DheZpsA26pgfuAk5Octtjx1mNInA48PfT+YKvN9k/bqdrtSc6YY/9aMsqafxH4xST/M8l9SabGNruVMer3mSRvAM7kb36QrFWjrPmDwD9LchDYz+AMai0bZc1fBX69bb8D+PkkrxvD3FbLyP/uj2I9hsQo/huwqar+HnA3sHeV5zMOJzC45DTJ4LfqTyU5eTUnNEaXAbdX1Y9XeyJjcDlwc1VtZHBZ4tNJXu4/B/418KtJHgJ+lcGnNqyH7/WyeLn/yzGXBT/qo6qeq6oftrf/GXjbmOa2Ukb5eJODwL6q+j9V9XXgzxmExlr1Uj7S5TLW/qUmGG3NVwG3AVTVnwI/y+BD4daqUf57/suq+vWqegvwb1vthbHNcPyW9eOM1m
NILPhRH7Ou3/0a8MQY57cSRvl4k//K4CyCJKcyuPz01BjnuNxG+kiXJL8EnAL86ZjntxJGWfP/Bs4HSPJ3GYTEX411lstrlP+eTx06W/oAsGfMcxy3fcAV7Smn84AXq+qZxR7suP5YjpVQnY/6SPIh4MGq2gf8yyS/BhxlcIPoylWb8DIYcc13ARckeZzBqfi/qarnVm/WSzPimmHwQ+XWao+FrGUjrnkng0uJ/4rBTewr1/LaR1zzJPDvkxTwJeDqVZvwMkjyWQZrOrXdW7oWeAVAVf0+g3tNFwMzwPeB9yxpvDX874ckaYWtx8tNkqQRGRKSpC5DQpLUZUhIkroMCUlSlyEhSeoyJCRJXf8Ppx/KUOUF4zMAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "res_df['tox_diff'] = np.abs(res_df.ref_tox - res_df.trn_tox)\n",
+ "res_df.tox_diff.hist()"
+ ]
+  }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file