From c6f8c542378dc979a55355bd562c8f1401e62f14 Mon Sep 17 00:00:00 2001
From: Nils Reimers
Date: Thu, 9 Jul 2020 16:44:12 +0200
Subject: [PATCH] Add truncation=True to prepare_for_model function

---
 sentence_transformers/__init__.py           | 2 +-
 sentence_transformers/models/ALBERT.py      | 2 +-
 sentence_transformers/models/BERT.py        | 3 +--
 sentence_transformers/models/CamemBERT.py   | 2 +-
 sentence_transformers/models/DistilBERT.py  | 2 +-
 sentence_transformers/models/RoBERTa.py     | 2 +-
 sentence_transformers/models/T5.py          | 2 +-
 sentence_transformers/models/Transformer.py | 2 +-
 sentence_transformers/models/XLMRoBERTa.py  | 2 +-
 sentence_transformers/models/XLNet.py       | 2 +-
 10 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py
index 9a42cc2ed..506c17c0b 100644
--- a/sentence_transformers/__init__.py
+++ b/sentence_transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.6"
+__version__ = "0.2.7"
 __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/'
 from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset
 from .data_samplers import LabelSampler
diff --git a/sentence_transformers/models/ALBERT.py b/sentence_transformers/models/ALBERT.py
index 53ac31715..dd6812b3a 100644
--- a/sentence_transformers/models/ALBERT.py
+++ b/sentence_transformers/models/ALBERT.py
@@ -61,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
 
     def get_config_dict(self):
diff --git a/sentence_transformers/models/BERT.py b/sentence_transformers/models/BERT.py
index 8d7eb115c..e50284c22 100644
--- a/sentence_transformers/models/BERT.py
+++ b/sentence_transformers/models/BERT.py
@@ -28,7 +28,6 @@ def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_
 
         self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
 
-
     def forward(self, features):
         """Returns token_embeddings, cls_token"""
         output_states = self.bert(**features)
@@ -62,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
 
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
 
     def get_config_dict(self):
diff --git a/sentence_transformers/models/CamemBERT.py b/sentence_transformers/models/CamemBERT.py
index dc6d2e006..daa28d749 100644
--- a/sentence_transformers/models/CamemBERT.py
+++ b/sentence_transformers/models/CamemBERT.py
@@ -63,7 +63,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
diff --git a/sentence_transformers/models/DistilBERT.py b/sentence_transformers/models/DistilBERT.py
index f6e86baf0..abdbcc0c5 100644
--- a/sentence_transformers/models/DistilBERT.py
+++ b/sentence_transformers/models/DistilBERT.py
@@ -62,7 +62,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
diff --git a/sentence_transformers/models/RoBERTa.py b/sentence_transformers/models/RoBERTa.py
index 648c3398a..12adcf2cf 100644
--- a/sentence_transformers/models/RoBERTa.py
+++ b/sentence_transformers/models/RoBERTa.py
@@ -61,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
diff --git a/sentence_transformers/models/T5.py b/sentence_transformers/models/T5.py
index 60a980186..1980d7090 100644
--- a/sentence_transformers/models/T5.py
+++ b/sentence_transformers/models/T5.py
@@ -61,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length)
 
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py
index 1865cfd02..e13485658 100644
--- a/sentence_transformers/models/Transformer.py
+++ b/sentence_transformers/models/Transformer.py
@@ -58,7 +58,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
diff --git a/sentence_transformers/models/XLMRoBERTa.py b/sentence_transformers/models/XLMRoBERTa.py
index f4e05472f..88f527c9d 100644
--- a/sentence_transformers/models/XLMRoBERTa.py
+++ b/sentence_transformers/models/XLMRoBERTa.py
@@ -63,7 +63,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
diff --git a/sentence_transformers/models/XLNet.py b/sentence_transformers/models/XLNet.py
index a1c0ad9e1..ae16b64d5 100644
--- a/sentence_transformers/models/XLNet.py
+++ b/sentence_transformers/models/XLNet.py
@@ -58,7 +58,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int) -> Dict[
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)
 
     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
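
Note: for readers outside the library, below is a minimal sketch of the patched call in
isolation. It assumes transformers >= 3.0, where passing max_length without an explicit
truncation flag logs a "Truncation was not explicitly activated" warning; the
'bert-base-uncased' checkpoint, the example sentence, and the stand-in pad_seq_length
computation are illustrative, not part of the patch.

    # Sketch of the patched prepare_for_model call (assumes transformers >= 3.0).
    from transformers import BertTokenizer

    # 'bert-base-uncased' is an illustrative checkpoint, not prescribed by the patch.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Token ids as produced upstream by the models' own tokenize() methods.
    tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("An example sentence"))

    # In the library, pad_seq_length comes from the caller and is capped at
    # self.max_seq_length; a stand-in derived from the token count is used here.
    pad_seq_length = min(len(tokens), 128) + 2  # + 2 leaves room for [CLS]/[SEP]

    # Without truncation=True, transformers 3.x warns that truncation was not
    # explicitly activated even though max_length is set; the flag makes the
    # longest-first truncation strategy explicit and silences the warning.
    features = tokenizer.prepare_for_model(tokens,
                                           max_length=pad_seq_length,
                                           pad_to_max_length=True,
                                           return_tensors='pt',
                                           truncation=True)
    print(features['input_ids'].shape)  # padded/truncated to pad_seq_length

The change should be behavior-preserving for inputs that already fit within max_length;
only over-long inputs are affected, which are now cut to max_length instead of being
passed through with a warning.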