@@ -1,5 +1,5 @@
 from os import stat_result
-from typing import Optional
+from typing import List, Optional
 import time

 import torch
@@ -12,7 +12,7 @@

 from transformers import AutoConfig, AutoModel, AdamW

-from paraphrasegen.loss import ContrastiveLoss
+from paraphrasegen.loss import ContrastiveLoss, Similarity
 from paraphrasegen.constants import (
     AVAIL_GPUS,
     BATCH_SIZE,
@@ -72,25 +72,48 @@ def forward(self, attention_mask, outputs):


 class MLPLayer(nn.Module):
-    def __init__(self, in_dims: int = 768, hidden_dims: int = 768):
+    def __init__(
+        self, in_dims: int = 768, hidden_dims: List[int] = 768, activation: str = "GELU"
+    ):
         super(MLPLayer, self).__init__()
-        self.fc1 = nn.Linear(in_dims, hidden_dims)
-        self.layer_norm = nn.LayerNorm(hidden_dims)
-        self.activation = nn.Tanh()
+
+        if activation == "GELU":
+            activation_fn = nn.GELU()
+        elif activation == "ReLU":
+            activation_fn = nn.ReLU()
+        elif activation == "mish":
+            activation_fn = nn.Mish()
+        elif activation == "leaky_relu":
+            activation_fn = nn.LeakyReLU()
+
+        layers = [
+            nn.Linear(in_dims, hidden_dims[0]),
+            nn.LayerNorm(hidden_dims[0]),
+            activation_fn,
+        ]
+
+        for i in range(1, len(hidden_dims)):
+            layers += [
+                nn.Linear(hidden_dims[i - 1], hidden_dims[i]),
+                nn.LayerNorm(hidden_dims[i]),
+                activation_fn,
+            ]
+
+        self.net = nn.Sequential(*layers)

     def forward(self, x: torch.Tensor):
-        out = self.fc1(x)
-        out = self.layer_norm(out)
-        return self.activation(out)
+        return self.net(x)


 class Encoder(pl.LightningModule):
     def __init__(
         self,
         model_name_or_path: str,
         input_mask_rate: float = 0.1,
-        embedding_from: str = "single",
         pooler_type: str = "cls",
+        mlp_layers: List[int] = [768],
+        temp: float = 0.05,
+        hard_negative_weight: float = 0,
         learning_rate: float = 3e-5,
         weight_decay: float = 0,
     ) -> None:
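For context, the reworked MLPLayer stacks Linear -> LayerNorm -> activation blocks from a list of hidden sizes instead of a single fixed projection. A minimal usage sketch follows; the batch size, hidden sizes, and random input are illustrative assumptions, not values from the training config. A list is passed for hidden_dims because the constructor indexes it, and the activation string stays within the four names the if/elif chain handles.

import torch

# Illustrative only: assumes MLPLayer is importable from the module edited above.
mlp = MLPLayer(in_dims=768, hidden_dims=[768, 512], activation="GELU")
pooled = torch.randn(32, 768)   # e.g. a batch of pooled [CLS] embeddings
projected = mlp(pooled)         # -> shape (32, 512)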
@@ -99,17 +122,19 @@ def __init__(
         self.save_hyperparameters()
         self.config = AutoConfig.from_pretrained(model_name_or_path)
         self.input_mask_rate = input_mask_rate
-        self.embedding_from = embedding_from
         self.bert_model = AutoModel.from_pretrained(
             model_name_or_path, config=self.config, cache_dir=PATH_BASE_MODELS
         )

         self.pooler_type = pooler_type
         self.pooler = Pooler(pooler_type)

-        self.net = MLPLayer()
+        self.net = MLPLayer(in_dims=768, hidden_dims=mlp_layers)

-        self.loss_fn = ContrastiveLoss()
+        self.loss_fn = ContrastiveLoss(
+            temp=self.hparams.temp,
+            hard_negative_weight=self.hparams.hard_negative_weight,
+        )

     def forward(
         self, input_ids: torch.Tensor, attention_mask: torch.Tensor, do_mlm: bool = True
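ContrastiveLoss itself is not part of this diff, so the sketch below is only an assumption of the usual SimCSE-style objective implied by the new temp and hard_negative_weight arguments and by the third, shuffled negative_outputs tensor passed to it in training_step below: in-batch cross-entropy over temperature-scaled cosine similarities, with each row's hard-negative logit optionally up-weighted.

import torch
import torch.nn as nn
import torch.nn.functional as F

def contrastive_loss(anchor, target, negative, temp=0.05, hard_negative_weight=0.0):
    # Hedged sketch; the real ContrastiveLoss in paraphrasegen.loss may differ.
    cos = nn.CosineSimilarity(dim=-1)
    anchor_target = cos(anchor.unsqueeze(1), target.unsqueeze(0)) / temp      # (B, B)
    anchor_negative = cos(anchor.unsqueeze(1), negative.unsqueeze(0)) / temp  # (B, B)
    logits = torch.cat([anchor_target, anchor_negative], dim=1)               # (B, 2B)

    # Optionally boost the logit of each row's own hard negative.
    batch_size = anchor.size(0)
    weights = torch.cat(
        [
            torch.zeros_like(anchor_target),
            hard_negative_weight * torch.eye(batch_size, device=anchor.device),
        ],
        dim=1,
    )
    logits = logits + weights

    # The i-th anchor should score highest against the i-th target.
    labels = torch.arange(batch_size, device=anchor.device)
    return F.cross_entropy(logits, labels)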
@@ -167,42 +192,59 @@ def training_step(self, batch, batch_idx):
             attention_mask=batch["target_attention_mask"],
         )

-        loss = self.loss_fn(anchor_outputs, target_outputs)
+        negative_index = torch.randperm(batch["anchor_input_ids"].size(0))
+
+        negative_outputs = self(
+            input_ids=batch["anchor_input_ids"][negative_index],
+            attention_mask=batch["anchor_attention_mask"][negative_index],
+        )
+
+        loss = self.loss_fn(anchor_outputs, target_outputs, negative_outputs)
         self.log("loss/train", loss)

         return loss

-    def validation_step(self, batch, batch_idx, dataloader_idx=0):
+    def _evaluate(self, batch):
         anchor_outputs = self(
             input_ids=batch["anchor_input_ids"],
             attention_mask=batch["anchor_attention_mask"],
+            do_mlm=False,
         )

         target_outputs = self(
             input_ids=batch["target_input_ids"],
             attention_mask=batch["target_attention_mask"],
+            do_mlm=False,
         )

-        loss = self.loss_fn(anchor_outputs, target_outputs)
+        pos_anchor_emb = anchor_outputs[batch["labels"] == 1]
+        pos_target_emb = target_outputs[batch["labels"] == 1]

-        self.log("loss/val", loss, prog_bar=True)
-        self.log("hp_metric", loss)
+        neg_anchor_emb = anchor_outputs[batch["labels"] == 0]
+        neg_target_emb = target_outputs[batch["labels"] == 0]

-    def test_step(self, batch, batch_idx):
-        anchor_outputs = self(
-            input_ids=batch["anchor_input_ids"],
-            attention_mask=batch["anchor_attention_mask"],
-        )
+        pos_diff = torch.norm(pos_anchor_emb - pos_target_emb).mean()
+        neg_diff = torch.norm(neg_anchor_emb - neg_target_emb).mean()

-        target_outputs = self(
-            input_ids=batch["target_input_ids"],
-            attention_mask=batch["target_attention_mask"],
+        sim = Similarity(temp=self.hparams.temp)
+        pos_sim = sim(pos_anchor_emb, pos_target_emb).mean()
+        neg_sim = sim(neg_anchor_emb, neg_target_emb).mean()
+
+        self.log_dict(
+            {
+                "diff/pos": pos_diff,
+                "diff/neg": neg_diff,
+                "sim/pos": pos_sim,
+                "sim/neg": neg_sim,
+            }
         )
+        self.log("hp_metric", pos_sim - neg_sim)

-        loss = self.loss_fn(anchor_outputs, target_outputs)
+    def validation_step(self, batch, batch_idx, dataloader_idx=0):
+        self._evaluate(batch)

-        self.log("loss/test", loss, prog_bar=True)
-        self.log("hp_metric", loss)
+    def test_step(self, batch, batch_idx):
+        self._evaluate(batch)

     def configure_optimizers(self):
         """Prepare optimizer and schedule (linear warmup and decay)"""
@@ -256,7 +298,7 @@ def configure_optimizers(self):
     trainer = Trainer(
         max_epochs=1,
         gpus=AVAIL_GPUS,
-        log_every_n_steps=10,
+        log_every_n_steps=2,
         precision=16,
         stochastic_weight_avg=True,
         logger=TensorBoardLogger("runs/"),