From e188ad73760b194d6b546c9025b6b2063cfff995 Mon Sep 17 00:00:00 2001 From: Artur Paniukov <chgk1101@gmail.com> Date: Mon, 23 Dec 2024 19:02:13 +0000 Subject: [PATCH] Fix Issue With Added Tokens --- python/openvino_tokenizers/constants.py | 2 +- python/openvino_tokenizers/hf_parser.py | 16 +++++++++++----- python/openvino_tokenizers/tokenizer_pipeline.py | 2 +- python/openvino_tokenizers/utils.py | 2 +- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/python/openvino_tokenizers/constants.py b/python/openvino_tokenizers/constants.py index e793ae51..2250c002 100644 --- a/python/openvino_tokenizers/constants.py +++ b/python/openvino_tokenizers/constants.py @@ -43,7 +43,7 @@ class UTF8ReplaceMode(Enum): def __str__(self): return self.value - + def __eq__(self, other): if isinstance(other, (UTF8ReplaceMode)): # UTF8ReplaceMode is a singleton, so we can compare them by reference diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py index 8a851c85..76760a20 100644 --- a/python/openvino_tokenizers/hf_parser.py +++ b/python/openvino_tokenizers/hf_parser.py @@ -276,7 +276,6 @@ def tokenization_model(self) -> None: "TemplateProcessing": CombineSegmentsStep.from_hf_json_template_postprocessor, "BertProcessing": CombineSegmentsStep.from_hf_json_bert_postprocessor, "RobertaProcessing": CombineSegmentsStep.from_hf_json_roberta_processor, - "ByteLevel": lambda *args: list(), # return no handle for ByteLevel so add_steps skips it } def post_tokenization(self) -> None: @@ -297,17 +296,22 @@ def post_tokenization(self) -> None: if pt_type == "Sequence": processors = post_processor_json["processors"] + byte_level = next( + ([] for step in processors if (step["type"] == "ByteLevel")), + None, + ) combine_segments_step = next( ( - self.post_tokenization_map[step["type"]](step, self.number_of_inputs, self.add_special_tokens) + step_class(step, self.number_of_inputs, self.add_special_tokens) for step in processors - if step["type"] in self.post_tokenization_map + if (step_class := self.post_tokenization_map.get(step["type"])) ), None, ) + combine_segments_step = combine_segments_step or byte_level if combine_segments_step is None: raise OVTypeError( - "Expected that Sequence post-tokenizer type contains one of supported post-tokenizers type:" + "Expected that Sequence post-tokenizer type contains one of supported post-tokenizers type: " f"{list(self.post_tokenization_map)}" ) else: @@ -376,7 +380,9 @@ def decoding(self) -> None: return skip_tokens = parse_special_tokens(self.original_tokenizer) - self.pipeline.add_steps(VocabDecoderStep(skip_tokens=list(skip_tokens), do_skip_tokens=self.skip_special_tokens)) + self.pipeline.add_steps( + VocabDecoderStep(skip_tokens=list(skip_tokens), do_skip_tokens=self.skip_special_tokens) + ) if self.tokenizer_json["decoder"]["type"] == "Sequence": for decoder_dict in self.tokenizer_json["decoder"]["decoders"]: diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py index 527a296e..ad294399 100644 --- a/python/openvino_tokenizers/tokenizer_pipeline.py +++ b/python/openvino_tokenizers/tokenizer_pipeline.py @@ -14,7 +14,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np -from openvino.runtime import Model, Output, PartialShape, Type, op, Shape +from openvino.runtime import Model, Output, PartialShape, Shape, Type, op from openvino.runtime import opset12 as opset from openvino.runtime.exceptions import OVTypeError, UserInputError from openvino.runtime.utils.types import as_node, make_constant_node diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py index 70951b1e..3364910a 100644 --- a/python/openvino_tokenizers/utils.py +++ b/python/openvino_tokenizers/utils.py @@ -4,7 +4,7 @@ import logging import re -from dataclasses import dataclass, fields, field +from dataclasses import dataclass, field, fields from functools import lru_cache from typing import Any, Dict, Optional, Sequence, Tuple, Union