Cleanup tokenizer code
vmenger committed Dec 1, 2023
1 parent eeffb20 commit 280e7a8
Showing 2 changed files with 42 additions and 6 deletions.
41 changes: 35 additions & 6 deletions docdeid/tokenizer.py
@@ -129,22 +129,34 @@ def __init__(self, tokens: list[Token], link_tokens: bool = True) -> None:
self._token_index = {token: i for i, token in enumerate(tokens)}

if link_tokens:
for i in range(len(tokens) - 1):
tokens[i].set_next_token(tokens[i + 1])
tokens[i + 1].set_previous_token(tokens[i])
self._link_tokens()

self._words: dict[str, set[str]] = {}
self._text_to_tokens: dict[str, defaultdict[str, list[Token]]] = {}

def _link_tokens(self) -> None:
"""Links the tokens."""
for i in range(len(self._tokens) - 1):
self._tokens[i].set_next_token(self._tokens[i + 1])
self._tokens[i + 1].set_previous_token(self._tokens[i])

def token_index(self, token: Token) -> int:
"""
Find the token index in this list, i.e. its nominal position in the list.
Args:
token: The input token.
Returns: The index in this tokenlist.
"""
return self._token_index[token]

def _init_token_lookup(self, matching_pipeline: Optional[list[StringModifier]] = None):
def _init_token_lookup(self, matching_pipeline: Optional[list[StringModifier]] = None) -> None:
"""
Initialize token lookup structures.
Returns:
A set of words (``string``), and a mapping of word (``string``) to one or more :class:`.Token`.
Args:
matching_pipeline: The matching pipeline to use.
"""

matching_pipeline = matching_pipeline or []
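
For illustration (not part of the commit), a minimal usage sketch of the refactored construction path and the token_index lookup; it assumes Token(text=..., start_char=..., end_char=...), which may differ from the actual constructor.

# Hedged sketch: TokenList links tokens on construction and supports dict-based index lookup.
from docdeid.tokenizer import Token, TokenList

tokens = [
    Token(text="Jane", start_char=0, end_char=4),
    Token(text="Doe", start_char=5, end_char=8),
]

# link_tokens=True (the default) now delegates to the private _link_tokens() helper,
# which wires up next/previous references between consecutive tokens.
token_list = TokenList(tokens)

# token_index returns the nominal position of the token in this list.
assert token_list.token_index(tokens[1]) == 1
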
@@ -167,6 +179,14 @@ def _init_token_lookup(self, matching_pipeline: Optional[list[StringModifier]] = None) -> None:
self._text_to_tokens[pipe_key] = text_to_tokens

def get_words(self, matching_pipeline: Optional[list[StringModifier]] = None) -> set[str]:
"""
Get all words in this list of tokens. Evaluates lazily.
Args:
matching_pipeline: The matching pipeline to apply.
Returns: A set of strings, with all words in this list of tokens.
"""

matching_pipeline = matching_pipeline or []
pipe_key = str(matching_pipeline)
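
As a rough sketch of the lazily evaluated word lookup described above (illustrative only; the Token constructor arguments are assumed):

# Hedged sketch: get_words computes the word set on first use and caches it per pipeline key.
from docdeid.tokenizer import Token, TokenList

token_list = TokenList([
    Token(text="Jane", start_char=0, end_char=4),
    Token(text="Doe", start_char=5, end_char=8),
])

words = token_list.get_words()  # first call initializes the lookup structures
assert words == {"Jane", "Doe"}
words = token_list.get_words()  # later calls reuse the cached result
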
@@ -179,6 +199,15 @@ def get_words(self, matching_pipeline: Optional[list[StringModifier]] = None) -> set[str]:
def token_lookup(
self, lookup_values: set[str], matching_pipeline: Optional[list[StringModifier]] = None
) -> set[Token]:
"""
Look up all tokens whose text matches a given set of lookup values. Evaluates lazily.
Args:
lookup_values: The set of lookup values to match the token text against.
matching_pipeline: The matching pipeline.
Returns: A set of ``Token``, of which the text matches one of the lookup values.
"""

matching_pipeline = matching_pipeline or []
pipe_key = str(matching_pipeline)
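
A similar sketch for token_lookup, again with assumed Token constructor arguments; the matching pipeline is left empty, so no specific StringModifier is named:

# Hedged sketch: token_lookup returns the tokens whose text occurs in lookup_values.
from docdeid.tokenizer import Token, TokenList

token_list = TokenList([
    Token(text="Jane", start_char=0, end_char=4),
    Token(text="Doe", start_char=5, end_char=8),
])

matches = token_list.token_lookup(lookup_values={"Doe", "Smith"})
assert {token.text for token in matches} == {"Doe"}
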
7 changes: 7 additions & 0 deletions tests/unit/test_tokenize.py
@@ -69,6 +69,13 @@ def test_iterate(self, short_tokens):
for i, token in enumerate(short_tokens):
assert token == token_list[i]

def test_index(self, short_tokens):

token_list = TokenList(short_tokens)

for i, token in enumerate(short_tokens):
assert token_list.token_index(token) == i

def test_create_tokenlist_link(self, short_tokens):
token_list = TokenList(short_tokens)

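
The new test_index relies on a short_tokens fixture defined elsewhere in the test suite; a minimal, hypothetical stand-in (with assumed Token constructor arguments) could look like:

import pytest
from docdeid.tokenizer import Token

@pytest.fixture
def short_tokens():
    # Hypothetical stand-in for the real fixture used by test_index and test_iterate.
    return [
        Token(text="Jane", start_char=0, end_char=4),
        Token(text="Doe", start_char=5, end_char=8),
    ]
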
