diff --git a/chatglm_tokenizer/tokenization_chatglm.py b/chatglm_tokenizer/tokenization_chatglm.py index d4ce416..96aa0bd 100644 --- a/chatglm_tokenizer/tokenization_chatglm.py +++ b/chatglm_tokenizer/tokenization_chatglm.py @@ -66,11 +66,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer): model_input_names = ["input_ids", "attention_mask", "position_ids"] def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.tokenizer = SPTokenizer(vocab_file) super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) self.name = "GLMTokenizer" self.vocab_file = vocab_file - self.tokenizer = SPTokenizer(vocab_file) self.special_tokens = { "": self.tokenizer.bos_id, "": self.tokenizer.eos_id, diff --git a/data_process.py b/data_process.py index 80dca8b..6d81389 100644 --- a/data_process.py +++ b/data_process.py @@ -4,40 +4,45 @@ from tqdm import tqdm from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer import pandas as pd -#from zhconv import convert +import os + + +# from zhconv import convert def process_wiki_clean(): - with open('./data/wikipedia_cn_20230720/wikipedia-cn-20230720-filtered.json','r',encoding='utf-8') as f: - data=json.load(f) - doc_ids=[] + with open('./data/wikipedia_cn_20230720/wikipedia-cn-20230720-filtered.json', 'r', encoding='utf-8') as f: + data = json.load(f) + doc_ids = [] for line in tqdm(data): - text=line['completion'] - text_id=tokenizer.encode(text,add_special_tokens=False) + text = line['completion'] + text_id = tokenizer.encode(text, add_special_tokens=False) text_id.append(tokenizer.special_tokens['']) - if len(text_id)>5: - doc_ids+=text_id - arr = np.array(doc_ids,dtype=np.uint16) - with open('./data/wiki.bin','wb') as f: + if len(text_id) > 5: + doc_ids += text_id + arr = np.array(doc_ids, dtype=np.uint16) + with open('./data/wiki.bin', 'wb') as f: f.write(arr.tobytes()) -def process_medical(data_path,name): - f=open(data_path,'r',encoding='utf-8') - doc_ids=[] + +def process_medical(data_path, name): + f = open(data_path, 'r', encoding='utf-8') + doc_ids = [] while True: - line=f.readline() + line = f.readline() if not line: break - line=json.loads(line) - text=line['text'] - text_id=tokenizer.encode(text,add_special_tokens=False) + line = json.loads(line) + text = line['text'] + text_id = tokenizer.encode(text, add_special_tokens=False) text_id.append(tokenizer.special_tokens['']) - if len(text_id)>5: - doc_ids+=text_id - arr = np.array(doc_ids,dtype=np.uint16) - with open('./data/medical_{}.bin'.format(name),'wb') as f: - f.write(arr.tobytes()) + if len(text_id) > 5: + doc_ids += text_id + arr = np.array(doc_ids, dtype=np.uint16) + with open('./data/medical_{}.bin'.format(name), 'wb') as f: + f.write(arr.tobytes()) + def sft_to_pretrain(): - doc_ids=[] + doc_ids = [] ''' df=pd.read_csv('./data/medical_qa_144w.csv') @@ -53,172 +58,189 @@ def sft_to_pretrain(): doc_ids+=text_id ''' - with open('./data/shibing624_medical/finetune/train_en_1.json','r',encoding='utf-8') as f: + with open('./data/shibing624_medical/finetune/train_en_1.json', 'r', encoding='utf-8') as f: for row in f: - line=json.loads(row) - q=line['input'] - a=line['output'] - q_id=tokenizer.encode(q,add_special_tokens=False) - a_id=tokenizer.encode(a,add_special_tokens=False) - text_id=q_id+a_id+[tokenizer.special_tokens['']] - if len(text_id)>5: - doc_ids+=text_id - with open('./data/shibing624_medical/finetune/test_en_1.json','r',encoding='utf-8') as f: + line = json.loads(row) + q = line['input'] + a = line['output'] + q_id = tokenizer.encode(q, add_special_tokens=False) + a_id = tokenizer.encode(a, add_special_tokens=False) + text_id = q_id + a_id + [tokenizer.special_tokens['']] + if len(text_id) > 5: + doc_ids += text_id + with open('./data/shibing624_medical/finetune/test_en_1.json', 'r', encoding='utf-8') as f: for row in f: - line=json.loads(row) - q=line['input'] - a=line['output'] - q_id=tokenizer.encode(q,add_special_tokens=False) - a_id=tokenizer.encode(a,add_special_tokens=False) - text_id=q_id+a_id+[tokenizer.special_tokens['']] - if len(text_id)>5: - doc_ids+=text_id - with open('./data/shibing624_medical/finetune/valid_en_1.json','r',encoding='utf-8') as f: + line = json.loads(row) + q = line['input'] + a = line['output'] + q_id = tokenizer.encode(q, add_special_tokens=False) + a_id = tokenizer.encode(a, add_special_tokens=False) + text_id = q_id + a_id + [tokenizer.special_tokens['']] + if len(text_id) > 5: + doc_ids += text_id + with open('./data/shibing624_medical/finetune/valid_en_1.json', 'r', encoding='utf-8') as f: for row in f: - line=json.loads(row) - q=line['input'] - a=line['output'] - q_id=tokenizer.encode(q,add_special_tokens=False) - a_id=tokenizer.encode(a,add_special_tokens=False) - text_id=q_id+a_id+[tokenizer.special_tokens['']] - if len(text_id)>5: - doc_ids+=text_id + line = json.loads(row) + q = line['input'] + a = line['output'] + q_id = tokenizer.encode(q, add_special_tokens=False) + a_id = tokenizer.encode(a, add_special_tokens=False) + text_id = q_id + a_id + [tokenizer.special_tokens['']] + if len(text_id) > 5: + doc_ids += text_id - with open('./data/shibing624_medical/finetune/train_zh_0.json','r',encoding='utf-8') as f: + with open('./data/shibing624_medical/finetune/train_zh_0.json', 'r', encoding='utf-8') as f: for row in f: - line=json.loads(row) - q=line['instruction']+line['input'] - a=line['output'] - q_id=tokenizer.encode(q,add_special_tokens=False) - a_id=tokenizer.encode(a,add_special_tokens=False) - text_id=q_id+a_id+[tokenizer.special_tokens['']] - if len(text_id)>5: - doc_ids+=text_id - with open('./data/shibing624_medical/finetune/test_zh_0.json','r',encoding='utf-8') as f: + line = json.loads(row) + q = line['instruction'] + line['input'] + a = line['output'] + q_id = tokenizer.encode(q, add_special_tokens=False) + a_id = tokenizer.encode(a, add_special_tokens=False) + text_id = q_id + a_id + [tokenizer.special_tokens['']] + if len(text_id) > 5: + doc_ids += text_id + with open('./data/shibing624_medical/finetune/test_zh_0.json', 'r', encoding='utf-8') as f: for row in f: - line=json.loads(row) - q=line['instruction']+line['input'] - a=line['output'] - q_id=tokenizer.encode(q,add_special_tokens=False) - a_id=tokenizer.encode(a,add_special_tokens=False) - text_id=q_id+a_id+[tokenizer.special_tokens['']] - if len(text_id)>5: - doc_ids+=text_id - with open('./data/shibing624_medical/finetune/valid_zh_0.json','r',encoding='utf-8') as f: + line = json.loads(row) + q = line['instruction'] + line['input'] + a = line['output'] + q_id = tokenizer.encode(q, add_special_tokens=False) + a_id = tokenizer.encode(a, add_special_tokens=False) + text_id = q_id + a_id + [tokenizer.special_tokens['']] + if len(text_id) > 5: + doc_ids += text_id + with open('./data/shibing624_medical/finetune/valid_zh_0.json', 'r', encoding='utf-8') as f: for row in f: - line=json.loads(row) - q=line['instruction']+line['input'] - a=line['output'] - q_id=tokenizer.encode(q,add_special_tokens=False) - a_id=tokenizer.encode(a,add_special_tokens=False) - text_id=q_id+a_id+[tokenizer.special_tokens['']] - if len(text_id)>5: - doc_ids+=text_id + line = json.loads(row) + q = line['instruction'] + line['input'] + a = line['output'] + q_id = tokenizer.encode(q, add_special_tokens=False) + a_id = tokenizer.encode(a, add_special_tokens=False) + text_id = q_id + a_id + [tokenizer.special_tokens['']] + if len(text_id) > 5: + doc_ids += text_id - arr = np.array(doc_ids,dtype=np.uint16) + arr = np.array(doc_ids, dtype=np.uint16) print(arr.shape) - with open('./data/medical_qa.bin','wb') as f: + with open('./data/medical_qa.bin', 'wb') as f: f.write(arr.tobytes()) + def process_baidu(): BATCH_SIZE = 1000000 - cnt=0 - batch_cnt=0 - token=0 - doc_ids=[] + cnt = 0 + batch_cnt = 0 + token = 0 + doc_ids = [] + + f1 = open('./data/563w_baidubaike/563w_baidubaike.json', 'r', encoding='utf-8') - f1=open('./data/563w_baidubaike/563w_baidubaike.json','r',encoding='utf-8') - while True: line = f1.readline() if not line: break - line=json.loads(line) - text='' + line = json.loads(line) + text = '' try: - text+=line['title']+':'+line['summary'] + text += line['title'] + ':' + line['summary'] except: pass for per in line['sections']: - text+=per['title']+':'+per['content']+'。' - text_id=tokenizer.encode(text,add_special_tokens=False) + text += per['title'] + ':' + per['content'] + '。' + text_id = tokenizer.encode(text, add_special_tokens=False) text_id.append(tokenizer.special_tokens['']) - if len(text_id)>5: - doc_ids+=text_id - cnt+=1 - if cnt%BATCH_SIZE==0: - batch_cnt+=1 - arr = np.array(doc_ids,dtype=np.uint16) - doc_ids=[] - print('cnt:',cnt,'arr_shape:',arr.shape) - with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt),'wb') as f2: + if len(text_id) > 5: + doc_ids += text_id + cnt += 1 + if cnt % BATCH_SIZE == 0: + batch_cnt += 1 + arr = np.array(doc_ids, dtype=np.uint16) + doc_ids = [] + print('cnt:', cnt, 'arr_shape:', arr.shape) + with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt), 'wb') as f2: f2.write(arr.tobytes()) del arr if not doc_ids: - batch_cnt+=1 - arr = np.array(doc_ids,dtype=np.uint16) - print('cnt:',cnt,'arr_shape:',arr.shape) - with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt),'wb') as f: + batch_cnt += 1 + arr = np.array(doc_ids, dtype=np.uint16) + print('cnt:', cnt, 'arr_shape:', arr.shape) + with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt), 'wb') as f: f.write(arr.tobytes()) - + + def process_c4(): c4_zh_paths = glob.glob('./data/c4_zh/*') - c4_zh_paths=sorted(c4_zh_paths) + c4_zh_paths = sorted(c4_zh_paths) print(len(c4_zh_paths)) - cnt=0 - token=0 - doc_ids=[] + cnt = 0 + token = 0 + doc_ids = [] for per in tqdm(c4_zh_paths): - with open(per,'r') as f: + with open(per, 'r') as f: for line in f: text = json.loads(line) text = text['text'] - text_id=tokenizer.encode(text,add_special_tokens=False) + text_id = tokenizer.encode(text, add_special_tokens=False) text_id.append(tokenizer.special_tokens['']) - if len(text_id)>5: - doc_ids+=text_id - cnt+=1 + if len(text_id) > 5: + doc_ids += text_id + cnt += 1 - arr = np.array(doc_ids,dtype=np.uint16) - with open('./data/c4_zh.bin','wb') as f: + arr = np.array(doc_ids, dtype=np.uint16) + with open('./data/c4_zh.bin', 'wb') as f: f.write(arr.tobytes()) print(arr.shape) -def process_wudao(): + +def process_wudao(slice_size): wudao_zh_paths = glob.glob('./data/WuDaoCorpus2.0_base_200G/*') - wudao_zh_paths=sorted(wudao_zh_paths) - print(len(wudao_zh_paths))#很多子文件 - cnt=0 - token=0 - doc_ids=[] - for per in tqdm(wudao_zh_paths[320:]):#wudao_zh_paths[i:j]手动分片,一片片处理,不然太大一次性处理不完 - with open(per,'r') as f: - data=json.load(f) - for text in data: - text = text['title'] + text['content'] - text_id=tokenizer.encode(text,add_special_tokens=False) - text_id.append(tokenizer.special_tokens['']) - if len(text_id)>5: - doc_ids+=text_id - # - # if cnt%10000==0: - # print(cnt) - cnt+=1 - #token+=len(text_id) - #break - # - # arr = np.array(doc_ids,dtype=np.uint16) - # with open('./data/c4-zh/{}.bin'.format(per.split('/')[-1].split('.')[0]),'wb') as f: - # f.write(arr.tobytes()) - # print(arr.shape) - arr = np.array(doc_ids,dtype=np.uint16) - with open('./data/wudaocorpus_zh_16.bin','wb') as f: - f.write(arr.tobytes()) - print(arr.shape) + wudao_zh_paths = sorted(wudao_zh_paths) + print(len(wudao_zh_paths)) # 很多子文件 + + def _internal_wudao_process(idx, slice_data): + cnt = 0 + token = 0 + doc_ids = [] + for per in tqdm(slice_data): # wudao_zh_paths[i:j]手动分片,一片片处理,不然太大一次性处理不完 + with open(per, 'r') as f: + data = json.load(f) + for text in data: + text = text['title'] + text['content'] + text_id = tokenizer.encode(text, add_special_tokens=False) + text_id.append(tokenizer.special_tokens['']) + if len(text_id) > 5: + doc_ids += text_id + # + # if cnt%10000==0: + # print(cnt) + cnt += 1 + # token+=len(text_id) + # break + # + # arr = np.array(doc_ids,dtype=np.uint16) + # with open('./data/c4-zh/{}.bin'.format(per.split('/')[-1].split('.')[0]),'wb') as f: + # f.write(arr.tobytes()) + # print(arr.shape) + arr = np.array(doc_ids, dtype=np.uint16) + with open(f'./data/wudaocorpus_zh_{idx}.bin', 'wb') as f: + f.write(arr.tobytes()) + print(arr.shape) + + max_idx = 0 + for idx, i in enumerate(range(len(wudao_zh_paths))[::slice_size]): + if i + slice_size < len(wudao_zh_paths): + slice_data = wudao_zh_paths[i:i + slice_size] + else: + slice_data = wudao_zh_paths[i:] + _internal_wudao_process(idx, slice_data) + max_idx = idx -if __name__=="__main__": + return max_idx + + +if __name__ == "__main__": tokenizer = ChatGLMTokenizer(vocab_file='./chatglm_tokenizer/tokenizer.model') # 数据预处理-如果下载分词处理后的数据,可以不用执行以下函数 # process_wiki_clean() @@ -226,12 +248,12 @@ def process_wudao(): # process_medical('./data/shibing624_medical/pretrain/train_encyclopedia.json','encyclopedia') # process_baidu() # process_c4() - # process_wudao() + # max_idx = process_wudao(slice_size=10) # print('data processing finished!') # 分词处理后的文件列表 - data_path_list=[ + data_path_list = [ './data/baidubaike_563w_1.bin', './data/baidubaike_563w_2.bin', './data/baidubaike_563w_3.bin', @@ -266,13 +288,19 @@ def process_wudao(): './data/wudaocorpus_zh_14.bin', './data/wudaocorpus_zh_15.bin', './data/wudaocorpus_zh_16.bin', - ] - data_lst=[] + ] \ + # .extend([f'./data/wudaocorpus_zh_{i}.bin' for i in range(max_idx)]) + + if os.path.exists('./data/pretrain_data.bin'): + print("Warning: The pretrain data is existed, " + "your operation will be added at the end of the file.") + for data_path in tqdm(data_path_list): - with open(data_path,'rb') as f: - data=np.fromfile(f,dtype=np.uint16) + data_lst = [] + with open(data_path, 'rb') as f: + data = np.fromfile(f, dtype=np.uint16) data_lst.append(data) - arr = np.concatenate(data_lst) - print(arr.shape) - with open('./data/pretrain_data.bin','wb') as f: - f.write(arr.tobytes()) + arr = np.concatenate(data_lst) + # print(arr.shape) + with open('./data/pretrain_data.bin', 'ab') as f: + f.write(arr.tobytes())