diff --git a/examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml b/examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml
index 0e41eaec..7d75a520 100755
--- a/examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml
+++ b/examples/Aquila/Aquila-chat/Aquila-chat-lora.yaml
@@ -1,12 +1,13 @@
-batch_size: 1
-epochs: 5
+batch_size: 4
+epochs: 10
 gradient_accumulation_steps: 1
 lr: 4.0e-5
 warm_up: 0.01
-lora_r: 8
+warm_up_iters: 200
+lora_r: 16
 lora_alpha: 32
-save_interval: 200
-log_interval: 1
+save_interval: 500
+log_interval: 10
 bmt_cpu_offload: False
 bmt_pre_load: True
 bmt_lr_decay_style: 'cosine'
@@ -17,4 +18,4 @@
 eps: 1.0e-8
 lora: True
 enable_sft_dataset_dir: './data/'
-enable_sft_dataset_file: 'convo_samples.jsonl'
+enable_sft_dataset_file: 'sft_v0.9.10_train_chinese.jsonl'
diff --git a/examples/Aquila/Aquila-chat/README.md b/examples/Aquila/Aquila-chat/README.md
index c383de0a..38577d4e 100755
--- a/examples/Aquila/Aquila-chat/README.md
+++ b/examples/Aquila/Aquila-chat/README.md
@@ -29,7 +29,7 @@
 悟道·天鹰Aquila系列模型将持续开源更优版本,大家可以先删除原来目录下的`checkpoints_in/aquilachat-7b`,再下载新权重,其他使用方式不变。
 
 - 2023/07/24 :发布权重文件 v0.9,开源了 AquilaCode-multi、AquilaCode-py。 AquilaChat-7B和Aquila-7B权重无更新, AquilaCode-7B-NV和AquilaCode-7B-TS权重暂时不会有更新计划。
-  - Aquila-7B md5: 18eac56434db0198494b22b321633785
+  - Aquila-7B md5: 5b56d31c8154c9184a38ff7bc6b4d887
   - AquilaChat-7B md5: 465683009c8b536ef4cca85febb0227c
   - AquilaCode-multi md5:07cfce9440a0fa1ac2768b39d2cf4286
   - AquilaCode-py md5:3faa85fc03d8fda70a73064f48d02d85
diff --git a/examples/Aquila/Aquila-chat/README_en.md b/examples/Aquila/Aquila-chat/README_en.md
index f7a66859..52be257d 100755
--- a/examples/Aquila/Aquila-chat/README_en.md
+++ b/examples/Aquila/Aquila-chat/README_en.md
@@ -31,7 +31,7 @@ The additional details of the Aquila model will be presented in the official tec
 We will continue to release improved versions of Aquila model as open source. You can start by deleting the `checkpoints_in/aquilachat-7b` in the original directory and then download the new weights. Other usage methods remain unchanged. For more details, please refer to the folloing change log:
 
 - 2023/07/24 :Released v0.8 checkpoint files,AquilaCode-multi and AquilaCode-python have been released while AquilaCode-7B-NV and AquilaCode-7B-TS are temporarily not maintained. There are no updates for the weights of Aquila-7B and AquilaChat-7B.
-  - Aquila-7B md5: 18eac56434db0198494b22b321633785
+  - Aquila-7B md5: 5b56d31c8154c9184a38ff7bc6b4d887
   - AquilaChat-7B md5: 465683009c8b536ef4cca85febb0227c
   - AquilaCode-multi md5:07cfce9440a0fa1ac2768b39d2cf4286
   - AquilaCode-py md5:3faa85fc03d8fda70a73064f48d02d85
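A quick way to check the updated checksums after downloading the new weights, using only the standard library. The checkpoint path below is a placeholder, since the exact filename depends on how the weights were fetched:

```python
import hashlib

def file_md5(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in chunks so multi-GB checkpoints don't need to fit in memory."""
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Compare the output against the value listed in the README, e.g. for Aquila-7B.
print(file_md5("./checkpoints_in/aquila-7b/pytorch_model.bin"))  # placeholder path
```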
diff --git a/examples/Aquila/Aquila-chat/aquila_chat.py b/examples/Aquila/Aquila-chat/aquila_chat.py
index 19d2a06b..79165557 100755
--- a/examples/Aquila/Aquila-chat/aquila_chat.py
+++ b/examples/Aquila/Aquila-chat/aquila_chat.py
@@ -7,6 +7,7 @@
 import gc
 gc.collect()
 torch.cuda.empty_cache()
+import sys;sys.path.append("/data2/yzd/FlagAI")
 from flagai.auto_model.auto_loader import AutoLoader
 from flagai.data.tokenizer import Tokenizer
 from flagai.env_args import EnvArgs
@@ -38,6 +39,7 @@
     training_script=__file__,
 )
 env_args = env_args.parse_args()
+
 #env_args.wandb = False
 
 # overwrite
diff --git a/examples/Aquila/Aquila-chat/generate_chat_lora.py b/examples/Aquila/Aquila-chat/generate_chat_lora.py
index f5da5230..28551527 100755
--- a/examples/Aquila/Aquila-chat/generate_chat_lora.py
+++ b/examples/Aquila/Aquila-chat/generate_chat_lora.py
@@ -3,6 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 import os
 import torch
+import sys;sys.path.append("/data2/yzd/FlagAI/")
 from flagai.auto_model.auto_loader import AutoLoader
 from flagai.model.predictor.predictor import Predictor
 from flagai.data.tokenizer import Tokenizer
@@ -10,8 +11,8 @@
 from flagai.model.predictor.aquila import aquila_generate
 
 
-state_dict = "./checkpoints_in/"
-model_name = 'aquila-7b'
+state_dict = "./checkpoints_in/lora/"
+model_name = 'aquilachat-7b'
 
 
 loader = AutoLoader("lm",
@@ -20,7 +21,7 @@
                     use_cache=True,
                     fp16=True,
                     device='cuda',
-                    adapter_dir='/data2/yzd/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquila_experiment/2023080216/') # Directory to adapter_model.bin and adapter_config.json
+                    adapter_dir='/data2/yzd/FlagAI/examples/Aquila/Aquila-chat/checkpoints_out/aquilachat_experiment/2023081313/') # Directory to adapter_model.bin and adapter_config.json
 model = loader.get_model()
 tokenizer = loader.get_tokenizer()
 
@@ -37,6 +38,7 @@
     "Write a short story about a dragon and a knight.",
     "翻译成英文: '我饿了想吃饭'",
     "write a fairy tale for me",
+    "用英文回答: 世界上最高的地方在哪里?"
 ]
 
 for text in texts:
diff --git a/examples/Aquila/Aquila-chat/hostfile b/examples/Aquila/Aquila-chat/hostfile
index eb54b83d..6d553c87 100755
--- a/examples/Aquila/Aquila-chat/hostfile
+++ b/examples/Aquila/Aquila-chat/hostfile
@@ -1 +1 @@
-192.168.20.3 slots=1
+192.168.20.2 slots=8
diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenization_utils.py b/flagai/data/tokenizer/uni_tokenizer/tokenization_utils.py
new file mode 100755
index 00000000..8ec66b77
--- /dev/null
+++ b/flagai/data/tokenizer/uni_tokenizer/tokenization_utils.py
@@ -0,0 +1,167 @@
+import regex as re
+from collections import OrderedDict
+from typing import Any, Dict, List, Optional, Tuple, Union, overload
+
+class Trie:
+    """
+    Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
+    Loose reference https://en.wikipedia.org/wiki/Trie
+    """
+
+    def __init__(self):
+        self.data = {}
+
+    def add(self, word: str):
+        if not word:
+            # Prevent empty string
+            return
+        ref = self.data
+        for char in word:
+            ref[char] = char in ref and ref[char] or {}
+            ref = ref[char]
+        ref[""] = 1
+
+    def split(self, text: str) -> List[str]:
+        states = OrderedDict()
+
+        # This will contain every indices where we need
+        # to cut.
+        # We force to cut at offset 0 and len(text) (added later)
+        offsets = [0]
+
+        # This is used by the lookahead which needs to skip over
+        # some text where the full match exceeded the place in the initial
+        # for loop
+        skip = 0
+        # Main loop, Giving this algorithm O(n) complexity
+        for current, current_char in enumerate(text):
+            if skip and current < skip:
+                # Prevents the lookahead for matching twice
+                # like extra_id_100 and id_100
+                continue
+
+            # This will track every state
+            # that stop matching, we need to stop tracking them.
+            # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then
+            # fail on "b", we need to remove 0 from the valid states.
+            to_remove = set()
+            # Whenever we found a match, we need to drop everything
+            # this is a greedy algorithm, it will match on the first found token
+            reset = False
+
+            # In this case, we already have partial matches (But unfinished)
+            for start, trie_pointer in states.items():
+                if "" in trie_pointer:
+                    # This is a final match, we need to reset and
+                    # store the results in `offsets`.
+
+                    # Lookahead to match longest first
+                    # Important in case of extra_id_1 vs extra_id_100
+                    # Here we are also actively looking for other earlier partial
+                    # matches
+                    # "[CLS]", "L", we need to match CLS even if L is special
+                    for lookstart, looktrie_pointer in states.items():
+                        if lookstart > start:
+                            # This partial match is later, we can stop looking
+                            break
+                        elif lookstart < start:
+                            # This partial match is earlier, the trie pointer
+                            # was already updated, so index is + 1
+                            lookahead_index = current + 1
+                            end = current + 1
+                        else:
+                            # Here lookstart == start and
+                            #      looktrie_pointer == trie_pointer
+                            # It wasn't updated yet so indices are current ones
+                            lookahead_index = current
+                            end = current
+                        next_char = text[lookahead_index] if lookahead_index < len(text) else None
+                        if "" in looktrie_pointer:
+                            start = lookstart
+                            end = lookahead_index
+                            skip = lookahead_index
+
+                        while next_char in looktrie_pointer:
+                            looktrie_pointer = looktrie_pointer[next_char]
+                            lookahead_index += 1
+                            if "" in looktrie_pointer:
+                                start = lookstart
+                                end = lookahead_index
+                                skip = lookahead_index
+
+                            if lookahead_index == len(text):
+                                # End of string
+                                break
+                            next_char = text[lookahead_index]
+                        # End lookahead
+
+                    # Storing and resetting
+                    offsets.append(start)
+                    offsets.append(end)
+                    reset = True
+                    break
+                elif current_char in trie_pointer:
+                    # The current character being looked at has a match within the trie
+                    # update the pointer (it will be stored back into states later).
+                    trie_pointer = trie_pointer[current_char]
+
+                    # Storing back the new pointer into the states.
+                    # Partial matches got longer by one.
+                    states[start] = trie_pointer
+                else:
+                    # The new character has not match in the trie, we need
+                    # to stop keeping track of this partial match.
+                    # We can't do it directly within the loop because of how
+                    # python iteration works
+                    to_remove.add(start)
+
+            # Either clearing the full start (we found a real match)
+            # Or clearing only the partial matches that didn't work.
+            if reset:
+                states = {}
+            else:
+                for start in to_remove:
+                    del states[start]
+
+            # If this character is a starting character within the trie
+            # start keeping track of this partial match.
+            if current >= skip and current_char in self.data:
+                states[current] = self.data[current_char]
+
+        # We have a cut at the end with states.
+        for start, trie_pointer in states.items():
+            if "" in trie_pointer:
+                # This is a final match, we need to reset and
+                # store the results in `offsets`.
+                end = len(text)
+                offsets.append(start)
+                offsets.append(end)
+                # Longest cut is always the one with lower start so the first
+                # item so we need to break.
+                break
+
+        return self.cut_text(text, offsets)
+
+    def cut_text(self, text, offsets):
+        # We have all the offsets now, we just need to do the actual splitting.
+        # We need to eventually add the first part of the string and the eventual
+        # last part.
+        offsets.append(len(text))
+        tokens = []
+        start = 0
+        for end in offsets:
+            if start > end:
+                logger.error(
+                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
+                    " anyway."
+                )
+                continue
+            elif start == end:
+                # This might happen if there's a match at index 0
+                # we're also preventing zero-width cuts in case of two
+                # consecutive matches
+                continue
+            tokens.append(text[start:end])
+            start = end
+
+        return tokens
\ No newline at end of file
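The `Trie` added above closely mirrors the one in Hugging Face's `tokenization_utils`: added tokens are cut out of the text in a single pass, and the lookahead prefers the longest match when one token is a prefix of another. A small illustrative sketch; the token strings here are arbitrary examples, not necessarily FlagAI's real special tokens:

```python
from flagai.data.tokenizer.uni_tokenizer.tokenization_utils import Trie

trie = Trie()
trie.add("[CLS]")
trie.add("<|endoftext|>")

# Added tokens come back as standalone pieces; everything else is left untouched.
print(trie.split("[CLS] Hello world<|endoftext|>"))
# ['[CLS]', ' Hello world', '<|endoftext|>']

# The lookahead in split() prefers the longest match when tokens share a prefix.
trie.add("extra_id_1")
trie.add("extra_id_100")
print(trie.split("This is a extra_id_100"))
# ['This is a ', 'extra_id_100']
```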
diff --git a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
index 7bf9ad68..efbe6ecd 100755
--- a/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
+++ b/flagai/data/tokenizer/uni_tokenizer/tokenizer.py
@@ -32,6 +32,7 @@
 from typing import List, Union, Optional
 import unicodedata
 import json
+from flagai.data.tokenizer.uni_tokenizer.tokenization_utils import Trie
 
 
 def is_control(ch):
@@ -372,7 +373,6 @@ def __init__(self,
             if tk not in self.command_name_map:
                 res = self.search_special(tk)
                 self.add_command_token(tk, res,self.tokenizer_class)
-
         self.command_name_map = {tok.name: tok for tok in self._command_tokens}
         self.command_token_map = {
             tok.token: tok
@@ -392,7 +392,8 @@ def __init__(self,
             self.token_end_id = self.TokenToId("<|endoftext|>")
         except KeyError:
             self.token_end_id = self.TokenToId("[SEP]")
-
+
+        self.tokens_trie = Trie()
         #print("All special tokens: ", str([(k, v.token, v.Id) for k,v in self.command_name_map.items()]))
 
     def get_vocab(self):
@@ -455,7 +456,7 @@ def _is_special(ch):
         return bool(ch) and (ch[0] == '[') and (ch[-1] == ']')
 
     def _encode(self, text):
-        tokens = self.text_tokenizer.tokenize(text)
+        tokens = self.tokenize(text)
         ids = self.text_tokenizer.convert_tokens_to_ids(tokens)
         return ids
 
@@ -484,7 +485,7 @@ def EncodeAsTokens(self, text, process_fn=None):
         processed_text = text
         if process_fn is not None:
             processed_text = process_fn(processed_text)
-        tokens = self.text_tokenizer.tokenize(processed_text)
+        tokens = self.tokenize(processed_text)
         return tokens
 
     def IdToToken(self, id):
@@ -521,10 +522,10 @@ def DecodeIds(self, ids):
                                tokens, self.command_token_map)
 
     def encode(self, text):
-        if hasattr(self.text_tokenizer, "encode"):
-            return self.text_tokenizer.encode(text)
+        # if hasattr(self.text_tokenizer, "encode"):
+        #     return self.text_tokenizer.encode(text)
         return self.convert_tokens_to_ids(
-            self.text_tokenizer.tokenize(text))
+            self.tokenize(text))
 
     def decode(self, ids):
         if hasattr(self.text_tokenizer, "decode"):
@@ -791,8 +792,14 @@ def tokenize_as_tensor(self, texts):
                                 sot_token=sot_token,
                                 eot_token=eot_token)
 
-    def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
-        tokens = self.text_tokenizer.tokenize(text)
+    def _create_trie(self, unique_no_split_tokens):
+        trie = Trie()
+        for token in unique_no_split_tokens:
+            trie.add(token)
+        self.tokens_trie = trie
+
+    def _tokenize(self, text, maxlen=None, add_spatial_tokens=False):
+        tokens = self.text_tokenizer.tokenize(text)
 
         if add_spatial_tokens:
             tokens.insert(0, self.get_command_id('cls'))
@@ -803,6 +810,31 @@ def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
             self.truncate_sequence(maxlen, tokens, pop_index=-index)
         return tokens
 
+    def tokenize(self, text, maxlen=None, add_spatial_tokens=False):
+        tokens = self.tokens_trie.split(text)
+        # ["This is something", "<special_token_1>", "  else"]
+        for i, token in enumerate(tokens):
+            if token in self._command_token_tokens:
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                # We strip left and right by default
+                if right:
+                    tokens[i + 1] = right.lstrip()
+                if left:
+                    tokens[i - 1] = left.rstrip()
+        # ["This is something", "<special_token_1>", "else"]
+        tokenized_text = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            if token in self._command_token_tokens:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token))
+        # ["This", " is", " something", "<special_token_1>", "else"]
+        return tokenized_text
+
     def search_special(self, name):
         if name == "cls":
             if self.check_special(''): return ''
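One wiring note: this patch defines `_create_trie()` and initializes an empty `tokens_trie`, but the call that fills the trie with the command-token strings, and the `_command_token_tokens` set that `tokenize()` checks, are not shown in the diff. Assuming that setup happens during tokenizer construction, the control flow of the new `tokenize()` is roughly the standalone sketch below; the special tokens and the whitespace "sub-tokenizer" are stand-ins, not FlagAI's actual ones:

```python
from flagai.data.tokenizer.uni_tokenizer.tokenization_utils import Trie

special_tokens = {"[CLS]", "[SEP]"}          # stand-ins for the command tokens
trie = Trie()
for tok in special_tokens:
    trie.add(tok)                            # what _create_trie() does with unique_no_split_tokens

def toy_tokenize(text):
    pieces = trie.split(text)                # cut around special tokens first
    out = []
    for piece in pieces:
        if not piece:
            continue                         # skip empty pieces, as tokenize() does
        if piece in special_tokens:
            out.append(piece)                # special tokens pass through untouched
        else:
            out.extend(piece.split())        # stand-in for self.text_tokenizer.tokenize(piece)
    return out

print(toy_tokenize("[CLS] hello world [SEP]"))
# ['[CLS]', 'hello', 'world', '[SEP]']
```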
diff --git a/flagai/env_trainer_v1.py b/flagai/env_trainer_v1.py
index de7a1a39..71dadcf3 100755
--- a/flagai/env_trainer_v1.py
+++ b/flagai/env_trainer_v1.py
@@ -676,7 +676,9 @@ def do_train(self,
                 if self.save_dir and (self.iteration + 1) % self.save_interval == 0 and \
                         self.iteration != best_iteration:
                     if self.adapter_save:
-                        self.model.save_pretrained(save_directory=self.save_dir)
+                        save_dir = self.save_dir+'/'+str(self.iteration)
+                        os.makedirs(save_dir,exist_ok=True)
+                        self.model.save_pretrained(save_directory=save_dir)
                     else:
                         best_iteration = self.iteration
                         save_checkpoint(self.iteration+1,
@@ -697,7 +699,9 @@ def do_train(self,
         # self.iteration-1 as the exact iteration
         if self.save_dir and (self.iteration-1) != best_iteration:
             if self.adapter_save:
-                self.model.save_pretrained(save_directory=self.save_dir)
+                save_dir = self.save_dir+'/'+str(self.iteration)
+                os.makedirs(save_dir,exist_ok=True)
+                self.model.save_pretrained(save_directory=save_dir)
             else:
                 save_checkpoint(self.iteration+1,
                                 best_iteration+1,
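With the trainer change above, each LoRA save now goes to its own `<save_dir>/<iteration>/` subfolder (holding `adapter_model.bin` and `adapter_config.json`) rather than overwriting a single directory. A minimal sketch of how an inference script like `generate_chat_lora.py` could pick the newest one; the experiment path below is a placeholder:

```python
import os

# Placeholder experiment directory; the trainer writes one numbered subfolder per save,
# e.g. 499, 999, ... when save_interval is 500.
save_dir = "./checkpoints_out/aquilachat_experiment/2023081313"

iterations = sorted(int(d) for d in os.listdir(save_dir) if d.isdigit())
adapter_dir = os.path.join(save_dir, str(iterations[-1]))   # most recent adapter checkpoint

# This is the directory to pass as adapter_dir= to AutoLoader,
# exactly as generate_chat_lora.py does above.
print(adapter_dir)
```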