Add truncation=True to prepare_for_model function
nreimers committed Jul 9, 2020
1 parent 8ff06a7 commit c6f8c54
Showing 10 changed files with 10 additions and 11 deletions.
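As of transformers v3, prepare_for_model warns when max_length is set without an explicit truncation strategy, so every call site below now passes truncation=True. A minimal sketch of the updated call, assuming a v3-era tokenizer; the checkpoint name, input sentence, and max_length value are illustrative and not taken from this commit:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # illustrative checkpoint
    # Token ids without special tokens, mirroring what get_sentence_features receives
    tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('An input that may exceed the limit'))

    features = tokenizer.prepare_for_model(
        tokens,
        max_length=12,            # stands in for pad_seq_length in the library code
        pad_to_max_length=True,   # pad shorter inputs up to max_length
        return_tensors='pt',
        truncation=True,          # the new argument: cut inputs longer than max_length
    )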
2 changes: 1 addition & 1 deletion sentence_transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.6"
+__version__ = "0.2.7"
 __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/'
 from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset
 from .data_samplers import LabelSampler
2 changes: 1 addition & 1 deletion sentence_transformers/models/ALBERT.py
@@ -61,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)


     def get_config_dict(self):
3 changes: 1 addition & 2 deletions sentence_transformers/models/BERT.py
@@ -28,7 +28,6 @@ def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_
         self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)


-
     def forward(self, features):
         """Returns token_embeddings, cls_token"""
         output_states = self.bert(**features)
@@ -62,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token

-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)


     def get_config_dict(self):
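The +2 in the context line above reserves headroom for the [CLS] and [SEP] tokens that prepare_for_model inserts; models whose tokenizers add more special tokens reserve +3 instead. A small worked example of the arithmetic, with illustrative values:

    max_seq_length = 128
    tokens = list(range(200))                               # a 200-token input batch maximum
    pad_seq_length = min(len(tokens), max_seq_length) + 2   # 128 + 2 = 130
    # With max_length=130 and truncation=True, prepare_for_model trims the
    # 200 ids to 128 and then adds [CLS] and [SEP], landing exactly on 130.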
2 changes: 1 addition & 1 deletion sentence_transformers/models/CamemBERT.py
@@ -63,7 +63,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
2 changes: 1 addition & 1 deletion sentence_transformers/models/DistilBERT.py
@@ -62,7 +62,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
2 changes: 1 addition & 1 deletion sentence_transformers/models/RoBERTa.py
@@ -61,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
2 changes: 1 addition & 1 deletion sentence_transformers/models/T5.py
@@ -61,7 +61,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         """

         pad_seq_length = min(pad_seq_length, self.max_seq_length)
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
2 changes: 1 addition & 1 deletion sentence_transformers/models/Transformer.py
@@ -58,7 +58,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
2 changes: 1 addition & 1 deletion sentence_transformers/models/XLMRoBERTa.py
@@ -63,7 +63,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int):
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
2 changes: 1 addition & 1 deletion sentence_transformers/models/XLNet.py
@@ -58,7 +58,7 @@ def get_sentence_features(self, tokens: List[int], pad_seq_length: int) -> Dict[
         :return: embedding ids, segment ids and mask for the sentence
         """
         pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens
-        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt')
+        return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt', truncation=True)

     def get_config_dict(self):
         return {key: self.__dict__[key] for key in self.config_keys}
