# sentence_sim.py (forked from 4N3MONE/nlp_tools)
import torch
from transformers import AutoModel, AutoTokenizer


class SentenceSim:
    SCALE_FACTOR = 100

    def __init__(self, model_path='BM-K/KoDiffCSE-RoBERTa'):
        # is_available must be called; the bare function object is always
        # truthy, which would select 'cuda' even on CPU-only machines.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = AutoModel.from_pretrained(model_path).to(self.device)
        self.model.eval()  # inference only; disable dropout
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def _get_embeddings(self, sentence: str):
        inputs = self.tokenizer(sentence, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():  # no gradients needed for similarity scoring
            outputs = self.model(**inputs, return_dict=False)[0]  # (batch_size, seq_length, hidden_dim)
        embeddings = outputs.mean(dim=1)  # mean-pool over tokens -> (batch_size, hidden_dim)
        return embeddings

    def _cosine_similarity(self, emb1, emb2):
        if len(emb1.shape) == 1:
            emb1 = emb1.unsqueeze(0)
        if len(emb2.shape) == 1:
            emb2 = emb2.unsqueeze(0)
        emb1_norm = emb1 / emb1.norm(dim=1)[:, None]
        emb2_norm = emb2 / emb2.norm(dim=1)[:, None]
        # .item() assumes one sentence per side, i.e. a 1x1 similarity matrix.
        return (emb1_norm @ emb2_norm.transpose(0, 1)).item() * self.SCALE_FACTOR

    def cal_score(self, label_sentence: str, compare_sentence: str) -> float:
        label_embeddings = self._get_embeddings(label_sentence)
        compare_embeddings = self._get_embeddings(compare_sentence)
        return round(self._cosine_similarity(label_embeddings, compare_embeddings), 3)
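

# Minimal usage sketch (the example sentences below are illustrative, not part
# of the original repository). cal_score returns cosine similarity scaled to
# the -100..100 range and rounded to three decimals; the exact value depends
# on the model weights, which are downloaded on first instantiation.
if __name__ == '__main__':
    sim = SentenceSim()
    print(sim.cal_score('오늘 날씨가 정말 좋다.', '오늘은 날씨가 맑고 화창하다.'))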