From da30f934d8c5625636004c386298d8e18e7d8448 Mon Sep 17 00:00:00 2001 From: Hossam Hagag <90828745+hh-space-invader@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:59:52 +0200 Subject: [PATCH] fix: Fix colbert model shape mismatch (#413) * fix: Fix colbert model shape mismatch * refactor: Added the truncation after tokenizer init --- fastembed/late_interaction/colbert.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py index 4d65fc29..97c38570 100644 --- a/fastembed/late_interaction/colbert.py +++ b/fastembed/late_interaction/colbert.py @@ -191,6 +191,9 @@ def load_onnx_model(self) -> None: self.tokenizer.encode(symbol, add_special_tokens=False).ids[0] for symbol in string.punctuation } + current_max_length = self.tokenizer.truncation["max_length"] + # ensure not to overflow after adding document-marker + self.tokenizer.enable_truncation(max_length=current_max_length - 1) def embed( self,