From 280e7a8a58a3979d7955f6e12630198e40b23882 Mon Sep 17 00:00:00 2001
From: Vincent Menger
Date: Fri, 1 Dec 2023 20:50:49 +0100
Subject: [PATCH] Cleanup tokenizer code

---
 docdeid/tokenizer.py        | 41 +++++++++++++++++++++++++++++++++++------
 tests/unit/test_tokenize.py |  7 +++++++
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/docdeid/tokenizer.py b/docdeid/tokenizer.py
index e14a031..91022e4 100644
--- a/docdeid/tokenizer.py
+++ b/docdeid/tokenizer.py
@@ -129,22 +129,34 @@ def __init__(self, tokens: list[Token], link_tokens: bool = True) -> None:
         self._token_index = {token: i for i, token in enumerate(tokens)}
 
         if link_tokens:
-            for i in range(len(tokens) - 1):
-                tokens[i].set_next_token(tokens[i + 1])
-                tokens[i + 1].set_previous_token(tokens[i])
+            self._link_tokens()
 
         self._words: dict[str, set[str]] = {}
         self._text_to_tokens: dict[str, defaultdict[str, list[Token]]] = {}
 
+    def _link_tokens(self) -> None:
+        """Link each token to its previous and next token."""
+        for i in range(len(self._tokens) - 1):
+            self._tokens[i].set_next_token(self._tokens[i + 1])
+            self._tokens[i + 1].set_previous_token(self._tokens[i])
+
     def token_index(self, token: Token) -> int:
+        """
+        Find the index of a token, i.e. its position in this list.
+
+        Args:
+            token: The input token.
+
+        Returns: The index of the token in this list.
+        """
         return self._token_index[token]
 
-    def _init_token_lookup(self, matching_pipeline: Optional[list[StringModifier]] = None):
+    def _init_token_lookup(self, matching_pipeline: Optional[list[StringModifier]] = None) -> None:
         """
         Initialize token lookup structures.
 
-        Returns:
-            A set of words (``string``), and a mapping of word (``string``) to one or more :class:`.Token`.
+        Args:
+            matching_pipeline: The matching pipeline to use.
         """
 
         matching_pipeline = matching_pipeline or []
@@ -167,6 +179,14 @@ def _init_token_lookup(self, matching_pipeline: Optional[list[StringModifier]] =
         self._text_to_tokens[pipe_key] = text_to_tokens
 
     def get_words(self, matching_pipeline: Optional[list[StringModifier]] = None) -> set[str]:
+        """
+        Get all words in this list of tokens. Evaluates lazily.
+
+        Args:
+            matching_pipeline: The matching pipeline to apply.
+
+        Returns: A set of strings containing all words in this list of tokens.
+        """
         matching_pipeline = matching_pipeline or []
 
         pipe_key = str(matching_pipeline)
@@ -179,6 +199,15 @@
     def token_lookup(
         self, lookup_values: set[str], matching_pipeline: Optional[list[StringModifier]] = None
     ) -> set[Token]:
+        """
+        Look up all tokens whose text matches one of a set of lookup values. Evaluates lazily.
+
+        Args:
+            lookup_values: The set of lookup values to match the token text against.
+            matching_pipeline: The matching pipeline to apply.
+
+        Returns: A set of ``Token`` whose text matches one of the lookup values.
+        """
         matching_pipeline = matching_pipeline or []
 
         pipe_key = str(matching_pipeline)
diff --git a/tests/unit/test_tokenize.py b/tests/unit/test_tokenize.py
index df2402d..a4916cc 100644
--- a/tests/unit/test_tokenize.py
+++ b/tests/unit/test_tokenize.py
@@ -69,6 +69,13 @@ def test_iterate(self, short_tokens):
         for i, token in enumerate(short_tokens):
             assert token == token_list[i]
 
+    def test_index(self, short_tokens):
+
+        token_list = TokenList(short_tokens)
+
+        for i, token in enumerate(short_tokens):
+            assert token_list.token_index(token) == i
+
     def test_create_tokenlist_link(self, short_tokens):
 
         token_list = TokenList(short_tokens)
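
For reviewers, a minimal usage sketch (not part of the patch) of the TokenList methods this commit touches. It assumes Token and TokenList are importable from docdeid.tokenizer and that Token is constructed with text, start_char and end_char; adjust to your docdeid version if these names differ.

from docdeid.tokenizer import Token, TokenList

# Two hand-built tokens; in practice a Tokenizer produces these.
tokens = [
    Token(text="Hello", start_char=0, end_char=5),
    Token(text="world", start_char=6, end_char=11),
]

token_list = TokenList(tokens)

# token_index returns the position of a token within the list.
assert token_list.token_index(tokens[1]) == 1

# get_words and token_lookup build their lookup structures lazily, on first use
# per matching pipeline.
assert token_list.get_words() == {"Hello", "world"}
assert {t.text for t in token_list.token_lookup({"world"})} == {"world"}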