Roberta model

nducthang · nducthang · commit df9718bf1c50 · 2020-10-27T12:08:23.000+07:00
diff --git a/RobertaRun.py b/RobertaRun.py
@@ -0,0 +1,50 @@
+"""Train RobertaTweetModel with stratified K-fold cross-validation.
+
+One model is trained per fold and its weights are written to
+./weights/roberta/roberta_fold_<fold>.bin.
+"""
+from sklearn.model_selection import StratifiedKFold
+import pandas as pd
+from RobertaTweetModel import RobertaTweetModel
+import torch.optim as optim
+from utils.loss import loss
+from utils.roberta_get_train_val_loaders import roberta_get_train_val_loaders
+from RobertaTrainModel import roberta_train_model
+import torch
+
+num_epochs = 3
+batch_size = 1
+seed_value = 28091997
+
+# Seed the CPU generator as well as CUDA: the model is constructed (and its
+# head randomly initialised) before roberta_train_model moves it to the GPU.
+torch.manual_seed(seed_value)
+torch.cuda.manual_seed(seed_value)
+torch.cuda.manual_seed_all(seed_value)
+torch.backends.cudnn.deterministic = True
+# cuDNN autotuning may select a different kernel on each run, which defeats
+# the deterministic setting above, so it is switched off.
+torch.backends.cudnn.benchmark = False
+skl = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_value)
+
+train_df = pd.read_csv('./data/train.csv')
+# Force both text columns to str; missing cells become the text 'nan'.
+train_df['text'] = train_df['text'].astype(str)
+train_df['selected_text'] = train_df['selected_text'].astype(str)
+
+# Folds are stratified on the sentiment label.
+for fold, (train_idx, val_idx) in enumerate(skl.split(train_df, train_df.sentiment), start=1):
+    print("========== Fold {} ========== ".format(fold))
+    # A fresh model and optimizer are built for every fold.
+    model = RobertaTweetModel()
+    optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
+    dataloader_dict = roberta_get_train_val_loaders(
+        train_df, train_idx, val_idx, batch_size)
+    roberta_train_model(
+        model,
+        dataloader_dict,
+        loss,
+        optimizer,
+        num_epochs,
+        f'./weights/roberta/roberta_fold_{fold}.bin'
+    )
diff --git a/RobertaTrainModel.py b/RobertaTrainModel.py
@@ -0,0 +1,95 @@
+import os
+
+import torch
+from utils.compute_jaccard_score import compute_jaccard_score
+import tqdm
+
+
+def roberta_train_model(model, dataloaders_dict, loss, optimizer, num_epochs, filename):
+    """Train and validate ``model`` on the GPU, then save its weights.
+
+    Every epoch runs a 'train' phase followed by a 'val' phase and prints
+    the averaged loss and Jaccard score of each phase.
+
+    Args:
+        model: module called as ``model(ids, masks)``; must return
+            ``(start_logits, end_logits)``.
+        dataloaders_dict: mapping with a 'train' and a 'val' loader whose
+            batches are dicts with the keys 'ids', 'masks', 'tweet',
+            'offsets', 'start_idx' and 'end_idx'.
+        loss: callable used as
+            ``loss(start_logits, end_logits, start_idx, end_idx)``.
+        optimizer: optimizer stepped once per training batch.
+        num_epochs: number of epochs to run.
+        filename: path the final ``state_dict`` is saved to; a missing
+            parent directory is created.
+    """
+    model.cuda()
+
+    for epoch in range(num_epochs):
+        # Each epoch has two phases: training, then validation.
+        for phase in ['train', 'val']:
+            is_train = phase == 'train'
+            if is_train:
+                model.train()
+            else:
+                model.eval()
+
+            # Running totals for this phase
+            epoch_loss = 0.0
+            epoch_jaccard = 0.0
+
+            for data in tqdm.tqdm((dataloaders_dict[phase])):
+                ids = data['ids'].cuda()
+                masks = data['masks'].cuda()
+                tweet = data['tweet']
+                offsets = data['offsets'].numpy()
+                start_idx = data['start_idx'].cuda()
+                end_idx = data['end_idx'].cuda()
+
+                # Reset the accumulated gradients
+                optimizer.zero_grad()
+
+                # Gradients are only tracked during the train phase.
+                with torch.set_grad_enabled(is_train):
+                    start_logits, end_logits = model(ids, masks)
+                    loss_value = loss(
+                        start_logits, end_logits, start_idx, end_idx)
+
+                    # Backpropagate and update the parameters
+                    if is_train:
+                        loss_value.backward()
+                        optimizer.step()
+
+                # Scaled by the batch size here and divided by the dataset
+                # size below.
+                epoch_loss += loss_value.item() * len(ids)
+
+                true_start = start_idx.detach().cpu().numpy()
+                true_end = end_idx.detach().cpu().numpy()
+                # Softmax over dim 1 turns the logits into probabilities.
+                start_probs = torch.softmax(
+                    start_logits, dim=1).detach().cpu().numpy()
+                end_probs = torch.softmax(
+                    end_logits, dim=1).detach().cpu().numpy()
+
+                # Add up the Jaccard score of every sample in the batch
+                samples = zip(tweet, true_start, true_end,
+                              start_probs, end_probs, offsets)
+                for sample in samples:
+                    epoch_jaccard += compute_jaccard_score(*sample)
+
+            # Average over the number of samples in the dataset
+            num_samples = len(dataloaders_dict[phase].dataset)
+            epoch_loss = epoch_loss / num_samples
+            epoch_jaccard = epoch_jaccard / num_samples
+
+            print("Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}".format(
+                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))
+
+    # Make sure the target directory exists so the trained weights are not
+    # lost to a failing save at the very end of the run.
+    save_dir = os.path.dirname(filename)
+    if save_dir:
+        os.makedirs(save_dir, exist_ok=True)
+    torch.save(model.state_dict(), filename)
diff --git a/RobertaTweetDataset.py b/RobertaTweetDataset.py
@@ -0,0 +1,132 @@
+import torch
+import tokenizers
+import pandas as pd
+
+class RobertaTweetDataset(torch.utils.data.Dataset):
+    """Dataset turning tweet rows into RoBERTa span-extraction samples.
+
+    Rows must provide ``text`` and ``sentiment``; when the frame also has a
+    ``selected_text`` column, start/end token targets are produced as well.
+    """
+
+    def __init__(self, df, max_len=128):
+        # Source DataFrame
+        self.df = df
+        # Length each sample is padded up to
+        self.max_len = max_len
+        # True when the frame carries labels
+        self.labeled = 'selected_text' in df
+        # Byte-level BPE tokenizer
+        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
+            vocab_file='./roberta.base.torch/vocab.json',
+            merges_file='./roberta.base.torch/merges.txt',
+            lowercase=True,
+            add_prefix_space=True
+        )
+
+    def __len__(self):
+        """Return the number of rows in the DataFrame."""
+        return len(self.df)
+
+    def get_input_data(self, row):
+        """Build the model input for one row.
+
+        Layout: <s> sentiment </s></s> token11 token12 ... </s> <pad> <pad>
+
+        Returns:
+            ``(ids, masks, tweet, offsets)``. ``tweet`` is the lower-cased,
+            whitespace-normalised text with one leading space; ``offsets``
+            has one ``(start, end)`` character span into ``tweet`` per
+            token, ``(0, 0)`` for special tokens and padding.
+        """
+        # Collapse whitespace and put a single space in front of the tweet
+        tweet = " " + " ".join(row.text.lower().split())
+        encoding = self.tokenizer.encode(tweet)
+        # Sentiment ids noted by the original author:
+        # {'positive': 1313, 'negative': 2430, 'neutral': 7974}
+        sentiment_id = self.tokenizer.encode(row.sentiment).ids
+        # 0 is <s>, 2 is </s>, 1 is <pad>
+        ids = [0] + sentiment_id + [2, 2] + encoding.ids + [2]
+        # Everything in front of the tweet (<s>, the sentiment tokens and
+        # two </s>) gets an empty span, so the offsets stay aligned with
+        # ``ids`` however many tokens the sentiment encodes to.
+        prefix_len = len(sentiment_id) + 3
+        offsets = [(0, 0)] * prefix_len + encoding.offsets + [(0, 0)]
+
+        # Pad up to max_len; longer sequences are left as they are.
+        pad_len = self.max_len - len(ids)
+        if pad_len > 0:
+            ids += [1] * pad_len
+            offsets += [(0, 0)] * pad_len
+
+        ids = torch.tensor(ids)
+        # Attention mask: 1 for every position except <pad>
+        masks = (ids != 1).long()
+        offsets = torch.tensor(offsets)
+
+        return ids, masks, tweet, offsets
+
+    def get_target_idx(self, row, tweet, offsets):
+        """Find the tokens that cover ``row.selected_text`` in ``tweet``.
+
+        Args:
+            row: row providing ``selected_text``.
+            tweet: normalised text as returned by ``get_input_data``.
+            offsets: per-token ``(start, end)`` character spans into
+                ``tweet``.
+
+        Returns:
+            ``(start_idx, end_idx)``, the indices of the first and the last
+            token overlapping the selected text.
+
+        Raises:
+            ValueError: if the selected text is empty or no token overlaps
+                it.
+        """
+        # Same normalisation as the tweet, minus the leading space
+        selected_text = " ".join(row.selected_text.lower().split())
+
+        # Character position of the selected text inside the tweet. When it
+        # occurs more than once the last occurrence is used.
+        idx0 = tweet.rfind(selected_text) if selected_text else -1
+
+        # Flag every character that belongs to the selected text
+        char_targets = [0] * len(tweet)
+        if idx0 >= 0:
+            for ct in range(idx0, idx0 + len(selected_text)):
+                char_targets[ct] = 1
+
+        # Tokens whose character span touches a flagged character
+        target_idx = [
+            j for j, (offset1, offset2) in enumerate(offsets)
+            if sum(char_targets[offset1:offset2]) > 0
+        ]
+        if not target_idx:
+            raise ValueError(
+                "selected_text {!r} could not be aligned with tweet {!r}"
+                .format(row.selected_text, tweet))
+
+        # First and last token of the selected text
+        return target_idx[0], target_idx[-1]
+
+    def __getitem__(self, index):
+        """Convert row ``index`` of the DataFrame into a model sample.
+
+        The returned dict has the keys 'ids', 'masks', 'tweet' and
+        'offsets', plus 'start_idx' and 'end_idx' for labeled data.
+        """
+        row = self.df.iloc[index]
+        ids, masks, tweet, offsets = self.get_input_data(row)
+        data = {
+            'ids': ids,
+            'masks': masks,
+            'tweet': tweet,
+            'offsets': offsets,
+        }
+
+        if self.labeled:
+            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
+            data['start_idx'] = start_idx
+            data['end_idx'] = end_idx
+
+        return data
diff --git a/RobertaTweetModel.py b/RobertaTweetModel.py
@@ -0,0 +1,58 @@
+from torch import nn
+import torch
+from transformers import RobertaConfig, RobertaModel
+
+
+class RobertaTweetModel(nn.Module):
+    """RoBERTa encoder with a linear head giving start/end logits per token."""
+
+    def __init__(self):
+        super().__init__()
+        # forward() reads the hidden states of several layers, so the
+        # encoder has to return them.
+        config = RobertaConfig.from_pretrained(
+            './roberta.base.torch/config.json',
+            output_hidden_states=True
+        )
+        self.roberta = RobertaModel.from_pretrained(
+            './roberta.base.torch/pytorch_model.bin',
+            config=config
+        )
+
+        self.dropout = nn.Dropout(0.5)
+        # Two outputs per token: the start logit and the end logit
+        self.fc = nn.Linear(config.hidden_size, 2)
+        nn.init.normal_(self.fc.weight, std=0.2)
+        # NOTE(review): the second positional argument is the mean, so the
+        # bias is sampled around 0 instead of being set to 0 -- confirm
+        # that this is intended.
+        nn.init.normal_(self.fc.bias, 0)
+
+    def forward(self, input_ids, attention_mask):
+        """Return ``(start_logits, end_logits)`` for the given batch.
+
+        Args:
+            input_ids: token ids fed to RoBERTa.
+            attention_mask: mask with 0 for <pad> and 1 otherwise.
+        """
+        # Pick the hidden states by position rather than unpacking the
+        # result, so this works both when the encoder returns a plain
+        # tuple and when it returns a ModelOutput object.
+        outputs = self.roberta(input_ids, attention_mask=attention_mask)
+        hs = outputs[2]
+
+        # Original author's note: len(hs) = 13 tensors, each of shape
+        # (1, 128, 768). Stack the last four: (4, 1, 128, 768).
+        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
+        # Average them: (1, 128, 768)
+        x = torch.mean(x, 0)
+        x = self.dropout(x)
+        # (1, 128, 2)
+        x = self.fc(x)
+        start_logits, end_logits = x.split(1, dim=-1)
+
+        # Drop the trailing singleton dimension: (1, 128, 1) -> (1, 128)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        return start_logits, end_logits