Commit
Added LoRA adapter save interval and modified the tokenizer
Signed-off-by: ftgreat <[email protected]>
Showing 9 changed files with 231 additions and 23 deletions.
@@ -1 +1 @@
-192.168.20.3 slots=1
+192.168.20.2 slots=8
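This hunk appears to edit an MPI/DeepSpeed-style hostfile (the filename is not preserved in this view). In that convention each line names a node and the number of GPU slots it exposes, so the change swaps a 1-GPU node for an 8-GPU one. A hypothetical multi-node hostfile in the same format (addresses invented for illustration):

    192.168.20.2 slots=8
    192.168.20.4 slots=8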
flagai/data/tokenizer/uni_tokenizer/tokenization_utils.py (167 additions, 0 deletions)
@@ -0,0 +1,167 @@
import logging
import regex as re
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple, Union, overload

# `logger` is referenced in cut_text() but was undefined in the diff;
# the standard library logger is assumed here.
logger = logging.getLogger(__name__)


class Trie:
    """
    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass.
    Loose reference: https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self):
        self.data = {}

    def add(self, word: str):
        if not word:
            # Prevent empty string
            return
        ref = self.data
        for char in word:
            # Reuse the existing child node if present, otherwise create one
            ref[char] = char in ref and ref[char] or {}
            ref = ref[char]
        # The empty-string key marks the end of a complete word
        ref[""] = 1

    def split(self, text: str) -> List[str]:
        states = OrderedDict()

        # This will contain every index where we need to cut.
        # We force a cut at offset 0 and len(text) (added later).
        offsets = [0]

        # This is used by the lookahead, which needs to skip over
        # some text where the full match exceeded the place in the initial
        # for loop.
        skip = 0
        # Main loop, giving this algorithm O(n) complexity
        for current, current_char in enumerate(text):
            if skip and current < skip:
                # Prevents the lookahead from matching twice,
                # like extra_id_100 and id_100
                continue

            # This will track every state that stops matching;
            # we need to stop tracking them.
            # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then
            # fail on "b"; we need to remove 0 from the valid states.
            to_remove = set()
            # Whenever we find a match, we need to drop everything;
            # this is a greedy algorithm: it will match on the first found token.
            reset = False

            # In this case, we already have partial matches (but unfinished)
            for start, trie_pointer in states.items():
                if "" in trie_pointer:
                    # This is a final match; we need to reset and
                    # store the results in `offsets`.

                    # Lookahead to match longest first.
                    # Important in case of extra_id_1 vs extra_id_100.
                    # Here we are also actively looking for other earlier partial
                    # matches: for "[CLS]", "L", we need to match CLS even if L is special.
                    for lookstart, looktrie_pointer in states.items():
                        if lookstart > start:
                            # This partial match is later, we can stop looking
                            break
                        elif lookstart < start:
                            # This partial match is earlier; the trie pointer
                            # was already updated, so the index is + 1
                            lookahead_index = current + 1
                            end = current + 1
                        else:
                            # Here lookstart == start and
                            # looktrie_pointer == trie_pointer.
                            # It wasn't updated yet, so indices are the current ones.
                            lookahead_index = current
                            end = current
                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
                        if "" in looktrie_pointer:
                            start = lookstart
                            end = lookahead_index
                            skip = lookahead_index

                        while next_char in looktrie_pointer:
                            looktrie_pointer = looktrie_pointer[next_char]
                            lookahead_index += 1
                            if "" in looktrie_pointer:
                                start = lookstart
                                end = lookahead_index
                                skip = lookahead_index

                            if lookahead_index == len(text):
                                # End of string
                                break
                            next_char = text[lookahead_index]
                        # End lookahead

                    # Storing and resetting
                    offsets.append(start)
                    offsets.append(end)
                    reset = True
                    break
                elif current_char in trie_pointer:
                    # The current character being looked at has a match within the trie;
                    # update the pointer (it will be stored back into states later).
                    trie_pointer = trie_pointer[current_char]

                    # Storing back the new pointer into the states.
                    # Partial matches got longer by one.
                    states[start] = trie_pointer
                else:
                    # The new character has no match in the trie; we need
                    # to stop keeping track of this partial match.
                    # We can't do it directly within the loop because of how
                    # Python iteration works.
                    to_remove.add(start)

            # Either clearing the full start (we found a real match)
            # or clearing only the partial matches that didn't work.
            if reset:
                states = {}
            else:
                for start in to_remove:
                    del states[start]

            # If this character is a starting character within the trie,
            # start keeping track of this partial match.
            if current >= skip and current_char in self.data:
                states[current] = self.data[current_char]

        # We have a cut at the end with states.
        for start, trie_pointer in states.items():
            if "" in trie_pointer:
                # This is a final match; we need to
                # store the results in `offsets`.
                end = len(text)
                offsets.append(start)
                offsets.append(end)
                # The longest cut is always the one with the lowest start
                # (the first item), so we need to break.
                break

        return self.cut_text(text, offsets)

    def cut_text(self, text, offsets):
        # We have all the offsets now; we just need to do the actual splitting.
        # We need to eventually add the first part of the string and the
        # eventual last part.
        offsets.append(len(text))
        tokens = []
        start = 0
        for end in offsets:
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                    " anyway."
                )
                continue
            elif start == end:
                # This might happen if there's a match at index 0;
                # we're also preventing zero-width cuts in case of two
                # consecutive matches.
                continue
            tokens.append(text[start:end])
            start = end

        return tokens
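A minimal usage sketch of the new Trie (not part of the commit; the token strings are illustrative): the trie is seeded with the tokenizer's added/special tokens, and split() then cuts the text around every exact match, preferring the longest match via the lookahead:

    trie = Trie()
    trie.add("[CLS]")
    trie.add("extra_id_1")
    trie.add("extra_id_100")
    print(trie.split("[CLS] This is a extra_id_100"))
    # -> ['[CLS]', ' This is a ', 'extra_id_100']
    # "extra_id_100" wins over the shorter "extra_id_1" prefix

The single pass over the text keeps tokenization linear in the input length even with many added tokens, which is why a trie is used instead of matching each token separately.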