Commit
fix bugs for finetuning GLM-large-en on QQP (#60)
* test sync

* for afqmc task

Signed-off-by: marscrazy <[email protected]>

* change version to 1.0.3

Signed-off-by: marscrazy <[email protected]>

Co-authored-by: Anhforth <[email protected]>
marscrazy and Anhforth authored Jun 15, 2022
1 parent 97da025 commit ef0894d
Showing 9 changed files with 121 additions and 51 deletions.
14 changes: 11 additions & 3 deletions examples/glm_superglue/deepspeed.json
@@ -12,6 +12,14 @@
"allgather_bucket_size": 5e7,
"cpu_offload": true
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 1e-5,
"warmup_num_steps": 2000
}
},
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": true,
@@ -23,8 +31,8 @@
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0004,
"weight_decay": 0.01,
"lr": 1e-5,
"weight_decay": 0.1,
"betas": [
0.9,
0.98
@@ -37,4 +45,4 @@
"contiguous_memory_optimization": false
},
"wall_clock_breakdown": false
}
}
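The new "scheduler" block asks DeepSpeed to ramp the learning rate up before holding it at the peak value, in place of the higher fixed lr removed from the optimizer block. A minimal sketch of that behaviour, assuming linear warmup (DeepSpeed's WarmupLR also supports a log warmup type), using the values added to deepspeed.json:

def warmup_lr(step, warmup_min_lr=0.0, warmup_max_lr=1e-5, warmup_num_steps=2000):
    # Learning rate at a given optimizer step under a WarmupLR-style schedule.
    if step >= warmup_num_steps:
        return warmup_max_lr  # hold the peak learning rate once warmup is done
    frac = step / warmup_num_steps
    return warmup_min_lr + frac * (warmup_max_lr - warmup_min_lr)

# e.g. warmup_lr(0) -> 0.0, warmup_lr(1000) -> 5e-06, warmup_lr(2000) -> 1e-05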
2 changes: 1 addition & 1 deletion examples/glm_superglue/hostfile
@@ -1 +1 @@
127.0.0.1 slots=4
127.0.0.1 slots=4
28 changes: 10 additions & 18 deletions examples/glm_superglue/train_10b_clue.py
@@ -10,27 +10,19 @@
from flagai.test_utils import CollateArguments
from flagai.data.dataset import ConstructSuperglueStrategy


task_name = 'tnews'
trainer = Trainer(env_type='deepspeed',
epochs=2,
batch_size=4,
eval_interval=10,
checkpoint_activations=False,
fp16=True,
log_interval=1,
task_name = 'afqmc'
trainer = Trainer(env_type="pytorch",
batch_size=16,
epochs=10,
log_interval=100,
eval_interval=500,
load_dir=None,
pytorch_device="cuda",
save_dir="./glm_superglue_en",
master_ip='127.0.0.1',
master_port=17235,
num_nodes=1,
num_gpus=2,
hostfile='./hostfile',
model_parallel_size=2,
deepspeed_config='./deepspeed.json',
training_script=__file__)
save_epoch=1)

model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="glm-10b-ch")
model_name="GLM-large-ch")


tokenizer = GLMLargeChTokenizer()
32 changes: 16 additions & 16 deletions examples/glm_superglue/train_10b_superglue.py
@@ -3,35 +3,35 @@
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLM10bENBPETokenizer
from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments



task_name = 'boolq'
trainer = Trainer(env_type='deepspeed+mpu',
task_name = 'qqp'
trainer = Trainer(env_type='pytorch',
pytorch_device="cuda",
epochs=2,
batch_size=1,
eval_interval=1000,
checkpoint_activations=False,
fp16=True,
log_interval=1,
save_dir="./glm_superglue_en",
master_ip='127.0.0.1',
master_port=17755,
num_nodes=1,
num_gpus=2,
hostfile='./hostfile',
model_parallel_size=2,
deepspeed_config='./deepspeed.json',
training_script=__file__)
save_dir="./glm_superglue_en")
# master_ip='127.0.0.1',
# master_port=17755,
# num_nodes=1,
# num_gpus=2,
# hostfile='./hostfile',
# model_parallel_size=2,
# deepspeed_config='./deepspeed.json',
# training_script=__file__)

model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-10b-en")

tokenizer = GLM10bENBPETokenizer()
model_name="GLM-large-en")
tokenizer = GLMLargeEnWordPieceTokenizer()

train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
66 changes: 66 additions & 0 deletions examples/glm_superglue/train_test.py
@@ -0,0 +1,66 @@
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments



task_name = 'qqp'
trainer = Trainer(env_type='deepspeed',
pytorch_device='cuda:1',
epochs=10,
batch_size=36,
eval_interval=80,
log_interval=4,
checkpoint_activations=False,
fp16=True,
warm_up=0.1,
save_dir="./glm_superglue_en",
master_ip='127.0.0.1',
master_port=17755,
num_nodes=1,
num_gpus=4,
hostfile='./hostfile',
model_parallel_size=1,
deepspeed_config='./deepspeed.json',
training_script=__file__)

model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-large-en")

#tokenizer = GLM10bENBPETokenizer()
tokenizer = GLMLargeEnWordPieceTokenizer()
train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
tokenizer=tokenizer,
cloze_eval=True)
valid_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='dev',
tokenizer=tokenizer,
cloze_eval=True)

cl_args = CollateArguments()
cl_args.cloze_eval = True

if task_name in ['copa', 'wsc', 'record']:
cl_args.multi_token = True

from flagai.data.dataset import ConstructSuperglueStrategy

collate_fn = ConstructSuperglueStrategy(cl_args,
tokenizer,
task_name=task_name)


trainer.train(model,
train_dataset=train_dataset,
valid_dataset=valid_dataset,
collate_fn=collate_fn,
metric_methods=[["acc", accuracy_metric]])
23 changes: 13 additions & 10 deletions flagai/trainer.py
@@ -115,6 +115,7 @@ def __init__(
gradient_accumulation_steps=1, # 'Data Loader batch size'
weight_decay=0.0, # 'weight decay coefficient for L2 regularization'
lr=1e-3,
warm_up=0.1,
epochs=0, # 'Number of finetunning epochs. Zero results in evaluation only.'
save_epoch=1, # 'number of epochs between saves')
eval_interval=1,
@@ -164,6 +165,7 @@ def __init__(
self.clip_grad = clip_grad
self.seed = seed
self.fp16 = fp16
self.warm_up = warm_up

self.log_interval = log_interval
self.eval_interval = eval_interval
@@ -291,6 +293,7 @@ def get_dataloader(self, dataset, collate_fn, shuffle=False):
return torch.utils.data.DataLoader(dataset,
batch_size=self.batch_size,
collate_fn=collate_fn,
num_workers=4,
shuffle=shuffle)
else:
if self.env_type == 'deepspeed+mpu':
@@ -305,7 +308,7 @@ def get_dataloader(self, dataset, collate_fn, shuffle=False):
return torch.utils.data.DataLoader(dataset,
batch_size=self.batch_size,
sampler=sampler,
num_workers=1,
num_workers=4,
drop_last=False,
pin_memory=False,
collate_fn=collate_fn)
@@ -396,13 +399,13 @@ def train(self,
cpu_torch_adam=False,
fp16=self.fp16)

# if lr_scheduler == None:
# lr_scheduler = AnnealingLR(
# optimizer,
# start_lr=self.lr,
# warmup_iter=int(0.2* self.epochs * len(train_dataloader)),
# decay_style='linear',
# num_iters=self.epochs * len(train_dataloader))
if lr_scheduler == None and 'deepspeed' not in self.env_type:
lr_scheduler = AnnealingLR(
optimizer,
start_lr=self.lr,
warmup_iter=int(self.warm_up* self.epochs * len(train_dataloader)),
decay_style='linear',
num_iters=self.epochs * len(train_dataloader))

if 'deepspeed' in self.env_type:
# initialize the deepspeed
@@ -708,7 +711,7 @@ def evaluate(self,
model,
mems=mems)
lm_loss= step_output['loss']
# mems = step_output['hidden_states']
# mem = step_output['hidden_states']
'''when contiguous memory optimizations are enabled, the buffers
allocated by the optimizations are deallocated during backward pass
in the absence of backward pass the buffers should be reset after each
@@ -740,7 +743,7 @@
group=mpu.get_data_parallel_group())
elif self.env_type == 'deepspeed':
torch.distributed.all_reduce(
loss_data, group=deepspeed.utils.get_data_parallel_group())
loss_data)
elif self.env_type == 'pytorchDDP':
torch.distributed.all_reduce(loss_data)
loss_data = loss_data.tolist()
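The restored AnnealingLR branch (now used only when the environment is not DeepSpeed) derives its warmup length from the new warm_up argument instead of the previously hard-coded 0.2. A quick sketch of the arithmetic with the settings from train_test.py; the dataloader length here is an assumed placeholder, since the real value depends on the dataset size and batch size:

warm_up = 0.1           # new Trainer argument introduced in this commit
epochs = 10             # as in train_test.py
steps_per_epoch = 1000  # assumed len(train_dataloader), for illustration only

num_iters = epochs * steps_per_epoch     # 10000 total training steps
warmup_iter = int(warm_up * num_iters)   # 1000 warmup steps
# AnnealingLR then decays the learning rate linearly from self.lr over num_iters.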
4 changes: 2 additions & 2 deletions setup.py
@@ -5,7 +5,7 @@

setup(
name="flagai",
version="v1.0.2",
version="v1.0.3",
description="FlagAI aims to help researchers and developers to freely train and test large-scale models for NLP tasks.",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
@@ -29,4 +29,4 @@
'setuptools==59.5.0',
'protobuf==3.20.1',
]
)
)
1 change: 1 addition & 0 deletions test.py
@@ -3,6 +3,7 @@
# Licensed under the Apache License, Version 2.0 (the "License")
import unittest

print('test syn')
test_dir = './tests'
test_report_path = './test_report'
discover = unittest.defaultTestLoader.discover(test_dir, pattern='test_*.py')
2 changes: 1 addition & 1 deletion tests/test_tokenizer.py
@@ -80,7 +80,7 @@ def suite():
suite = unittest.TestSuite()
suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_ch'))
suite.addTest(TokenizerTestCase('test_tokenizer_GLM_large_en'))
#suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en'))
suite.addTest(TokenizerTestCase('test_tokenizer_glm_10_en'))
suite.addTest(TokenizerTestCase('test_tokenizer_t5'))
suite.addTest(TokenizerTestCase('test_tokenizer_roberta'))
suite.addTest(TokenizerTestCase('test_tokenizer_bert'))
