Added CLIP module and redesigned tokenizer apis (#81)
* merged clip tokenizer

Signed-off-by: BAAI-OpenPlatform <[email protected]>

* Update inference_clip.py

* Update auto_loader.py

* Update glm_10b_en_tokenizer.py

* swinv1v2

Signed-off-by: zhaohu xing <[email protected]>

* updated the version

Signed-off-by: Anhforth <[email protected]>

* updated the requirement packages list

Signed-off-by: Anhforth <[email protected]>

* fixed some issues

Signed-off-by: BAAI-OpenPlatform <[email protected]>

* fixed some issues

Signed-off-by: BAAI-OpenPlatform <[email protected]>

* tried to fix the data directory not found error

Signed-off-by: BAAI-OpenPlatform <[email protected]>

* fixed issues in running glm_seq2seq

Signed-off-by: BAAI-OpenPlatform <[email protected]>

* Update test_glm_seq2seq.py

* Update setup.py

Signed-off-by: Anhforth <[email protected]>
Signed-off-by: ZhaodongYan1 <[email protected]>
Signed-off-by: zhaohu xing <[email protected]>
Signed-off-by: shunxing1234 <[email protected]>
Signed-off-by: BAAI-OpenPlatform <[email protected]>
Co-authored-by: Anhforth <[email protected]>
Co-authored-by: zhaohu xing <[email protected]>
Co-authored-by: Zhaodong Yan <[email protected]>
Co-authored-by: ZhaodongYan1 <[email protected]>
Co-authored-by: Zac Liu <[email protected]>
Co-authored-by: zhaohu xing <[email protected]>
Co-authored-by: jongjyh <[email protected]>
Co-authored-by: wchh-2000 <[email protected]>
Co-authored-by: xuanricheng <[email protected]>
Co-authored-by: shunxing1234 <[email protected]>
11 people authored Aug 29, 2022
1 parent 693f9a4 commit dee25b7
Showing 88 changed files with 4,599 additions and 610 deletions.
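The tokenizer redesign is the common thread in the GLM-related diffs below: per-model tokenizer classes (GLMLargeChTokenizer, GLMLargeEnWordPieceTokenizer, GLM10bENBPETokenizer) are replaced by a single Tokenizer.from_pretrained(model_name) entry point, and command-token lookups change from tokenizer.get_command('pad').Id to tokenizer.get_command_id('pad'). A minimal before/after sketch, using only call patterns that appear in the changed files:

```python
# Old-style usage (removed in this commit): one tokenizer class per model,
# with command tokens exposed as objects carrying an .Id attribute.
# from flagai.data.tokenizer import GLMLargeChTokenizer
# tokenizer = GLMLargeChTokenizer()
# pad_id = tokenizer.get_command('pad').Id

# New-style usage (introduced in this commit): a single Tokenizer keyed by
# model name, with command-token ids returned directly as integers.
from flagai.data.tokenizer import Tokenizer

tokenizer = Tokenizer.from_pretrained("GLM-large-ch")
pad_id = tokenizer.get_command_id('pad')  # used as pad_id in the collate functions below
eos_id = tokenizer.get_command_id('eos')  # used as eod_token in the pretraining example
```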
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -118,7 +118,7 @@ class GLMTitleGenerationCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion doc_zh/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -131,7 +131,7 @@ class GLMPoetryDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_12_GLM_EXAMPLE_TITLE_GENERATION.md
@@ -119,7 +119,7 @@ class GLMTitleGenerationCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command('pad').Id)
my_collate_fn = GLMTitleGenerationCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMTitleGenerationDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion docs/TUTORIAL_13_GLM_EXAMPLE_PEOTRY_GENERATION.md
@@ -122,7 +122,7 @@ class GLMPoetryDynamicCollateFN():
```python
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command('pad').Id)
my_collate_fn = GLMPoetryDynamicCollateFN(pad_id=tokenizer.get_command_id('pad'))
train_dataset = GLMPoetryDataset(train_src,
train_tgt)
```
2 changes: 1 addition & 1 deletion examples/bert_title_generation_english/generate.py
@@ -14,7 +14,7 @@
maxlen = 512
auto_loader = AutoLoader(
"seq2seq",
model_name="bert-base-uncased",
model_name="BERT-base-en",
model_dir=model_dir,
)
model = auto_loader.get_model()
Binary file added examples/clip/CLIP.png
Binary file added examples/clip/data/img/0.jpg
Binary file added examples/clip/data/img/1.jpg
3 changes: 3 additions & 0 deletions examples/clip/data/pairs.csv
@@ -0,0 +1,3 @@
title filepath
a very typical bus station 0.jpg
the jetty : different types of plants to establish a variety of ecosystems . 1.jpg
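The sample pairs file appears to be tab-separated (the titles contain spaces), with a title and a filepath column; filepath is resolved against the img_dir passed to CsvDataset in the training scripts below. The actual CsvDataset implementation lives in flagai.data.dataset.mm.clip_dataset and is not part of this diff; the following is only an illustrative sketch of a dataset over such a file, reusing the tokenize_as_tensor call seen in inference_clip.py:

```python
# Illustrative sketch only: a minimal dataset over a pairs file like data/pairs.csv.
# The real CsvDataset may differ in column names, separator handling, and tokenization.
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset

class PairsCsvDataset(Dataset):
    def __init__(self, csv_path, img_dir, transform, tokenizer, sep="\t"):
        df = pd.read_csv(csv_path, sep=sep)
        self.titles = df["title"].tolist()
        self.filepaths = df["filepath"].tolist()
        self.img_dir = img_dir
        self.transform = transform
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        # Resolve the image path relative to img_dir and apply the CLIP transform.
        image = Image.open(os.path.join(self.img_dir, self.filepaths[idx])).convert("RGB")
        image = self.transform(image)
        # tokenize_as_tensor is the method used in inference_clip.py; it returns
        # a batch of token-id tensors, so take the first (and only) row.
        text = self.tokenizer.tokenize_as_tensor([self.titles[idx]])[0]
        return image, text
```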
48 changes: 48 additions & 0 deletions examples/clip/deepspeed.json
@@ -0,0 +1,48 @@
{
"train_micro_batch_size_per_gpu": 64,
"gradient_accumulation_steps": 1,
"steps_per_print": 100,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 2,
"contiguous_gradients": false,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e7,
"allgather_bucket_size": 5e7,
"cpu_offload": true
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 1e-5,
"warmup_num_steps": 2000
}
},
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 1e-5,
"weight_decay": 0.1,
"betas": [
0.9,
0.98
],
"eps": 1e-6
}
},
"activation_checkpointing": {
"partition_activations": true,
"contiguous_memory_optimization": false
},
"wall_clock_breakdown": false
}
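In broad strokes, this configuration enables ZeRO stage-2 sharding with CPU offload for optimizer state, fp16 training with dynamic loss scaling, Adam at lr 1e-5 with weight decay 0.1, and a WarmupLR schedule that ramps the learning rate to 1e-5 over the first 2,000 steps, at a micro-batch size of 64 per GPU. train_clip_deepspeed.py below points at it via deepspeed_config="./deepspeed.json".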
1 change: 1 addition & 0 deletions examples/clip/hostfile
@@ -0,0 +1 @@
127.0.0.1 slots=2
30 changes: 30 additions & 0 deletions examples/clip/inference_clip.py
@@ -0,0 +1,30 @@
import torch
from PIL import Image
from flagai.auto_model.auto_loader import AutoLoader
from flagai.data.dataset.mm.clip_dataset import clip_transform

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loader = AutoLoader(task_name="txt_img_matching", #contrastive learning
model_name="clip-base-p32-224")

model = loader.get_model()
model.eval()
model.to(device)
tokenizer = loader.get_tokenizer()
transform = clip_transform(img_size=model.image_size)

def inference():
image = Image.open("./CLIP.png")
image = transform(image).unsqueeze(0).to(device)
text = tokenizer.tokenize_as_tensor(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
image_features = model.encode_image(image)
text_features = model.encode_text(text)
text_probs = (image_features @ text_features.T).softmax(dim=-1)

print(text_probs.cpu().numpy()[0].tolist())

if __name__=="__main__":
inference()
36 changes: 36 additions & 0 deletions examples/clip/train_clip.py
@@ -0,0 +1,36 @@
import torch
from flagai.data.dataset.mm.clip_dataset import CsvDataset, clip_transform, collate_fn
from flagai.trainer import Trainer
from flagai.auto_model.auto_loader import AutoLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# cd examples/clip
data_path = "./data/pairs.csv"
img_dir = "./data/img"

trainer = Trainer(env_type="pytorch",
epochs=5,
pytorch_device=device,
batch_size=64,
lr=1e-4,
log_interval=10,
)

loader = AutoLoader(task_name="txt_img_matching",#contrastive learning
model_name="clip-base-p32-224",
)
model = loader.get_model()
tokenizer = loader.get_tokenizer()

transform = clip_transform(img_size=model.image_size)
train_dataset = CsvDataset(data_path,
img_dir,
transform,
tokenizer)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
trainer.train(model,
optimizer=optimizer,
train_dataset=train_dataset,
collate_fn=collate_fn)

48 changes: 48 additions & 0 deletions examples/clip/train_clip_deepspeed.py
@@ -0,0 +1,48 @@
import torch
from flagai.data.dataset.mm.clip_dataset import CsvDataset, clip_transform, collate_fn
from flagai.trainer import Trainer
from flagai.auto_model.auto_loader import AutoLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# cd examples/clip
data_path = "./data/pairs.csv"#"/mnt/datasets/multimodal/ConceptualCaptions/Train_GCC-training_output.csv"
img_dir = "./data/img"#"/mnt/datasets/multimodal/ConceptualCaptions"

trainer = Trainer(
env_type="deepspeed",
experiment_name="clip",
batch_size=64,
num_gpus=2,
fp16=True,
gradient_accumulation_steps=1,
lr=1e-4,
weight_decay=1e-5,
epochs=5,
log_interval=1,
load_dir=None,
pytorch_device=device,
save_dir="clip_deepspeed",
save_interval=1000,
num_checkpoints=1,
hostfile="./deepspeed/hostfile",
training_script=__file__,
deepspeed_config="./deepspeed.json"
)
loader = AutoLoader(task_name="txt_img_matching",#contrastive learning
model_name="clip-base-p32-224",
)
model = loader.get_model()
tokenizer = loader.get_tokenizer()

transform = clip_transform(img_size=model.image_size)
train_dataset = CsvDataset(data_path,
img_dir,
transform,
tokenizer)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
trainer.train(model,
optimizer=optimizer,
train_dataset=train_dataset,
collate_fn=collate_fn)
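Usage note: the DeepSpeed variant relies on the deepspeed.json and hostfile added in this commit; the hostfile's 127.0.0.1 slots=2 entry appears to correspond to num_gpus=2 in the Trainer, while the script's hostfile argument references ./deepspeed/hostfile, whereas the hostfile in this commit sits at examples/clip/hostfile.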

7 changes: 4 additions & 3 deletions examples/glm_blank_filling/glm_generate_samples.py
@@ -5,16 +5,17 @@
import torch

from flagai.model.glm_model import GLMModel
from flagai.data.tokenizer import GLMLargeChTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.model.predictor.predictor import Predictor
if __name__ == "__main__":
"""Main training program."""
print('Generate Samples')
# Random seeds for reproducability.
# Model,
model = GLMModel.from_pretrain(model_name='GLM-large-ch',
model_name = 'GLM-large-ch'
model = GLMModel.from_pretrain(model_name=model_name,
download_path="./state_dict/")
tokenizer = GLMLargeChTokenizer()
tokenizer = Tokenizer.from_pretrained(model_name)

model.cuda(torch.cuda.current_device())

2 changes: 1 addition & 1 deletion examples/glm_poetry_generation/train.py
@@ -130,7 +130,7 @@ def __call__(self, batch):
train_src, train_tgt = read_file()
print('-----------train data length:', len(train_src))
my_collate_fn = GLMPoetryDynamicCollateFN(
pad_id=tokenizer.get_command('pad').Id)
pad_id=tokenizer.get_command_id('pad'))
train_dataset = BertSeq2seqDataset(train_src, train_tgt)

trainer.train(model, train_dataset=train_dataset, collate_fn=my_collate_fn)
13 changes: 5 additions & 8 deletions examples/glm_pretrain/train.py
@@ -2,7 +2,7 @@
#
# Licensed under the Apache License, Version 2.0 (the "License")

from flagai.data.tokenizer import GLMLargeChTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.model.glm_model import GLMForSeq2Seq
from flagai.trainer import Trainer
from flagai.data.dataset import ConstructBlockStrategy
@@ -24,14 +24,11 @@
load_dir=None,
lr=1e-4,
save_interval=10)

model = GLMForSeq2Seq.from_pretrain(model_name='GLM-large-ch')

model_name = 'GLM-large-ch'
tokenizer = Tokenizer.from_pretrained(model_name)
ds_args = PretrainDatasetArguments()

tokenizer = GLMLargeChTokenizer()

ds_args = add_args(ds_args, tokenizer)
model = GLMForSeq2Seq.from_pretrain(model_name=model_name)

def create_dataset(tokenizer, should_split):
dataset = get_dataset_lazy("./examples/glm_pretrain/data",
@@ -59,7 +56,7 @@ def create_dataset(tokenizer, should_split):
collate_fn = None
if ds_args.block_lm:
collate_fn = ConstructBlockStrategy(
tokenizer, 512, eod_token=tokenizer.get_command('eos').Id)
tokenizer, 512, eod_token=tokenizer.get_command_id('eos'))
metric_methods = DEFAULT_METRICS['pretrain']
trainer.train(model,
collate_fn=collate_fn,
6 changes: 3 additions & 3 deletions examples/glm_seq2seq/train.py
@@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSeq2Seq
from flagai.data.tokenizer import GLMLargeEnWordPieceTokenizer, GLMLargeChTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.data.dataset import Seq2SeqDataset
from flagai.test_utils import Seq2SeqCollateArguments
from flagai.data.dataset.superglue.control import DEFAULT_METRICS, CH_TASKS
@@ -27,12 +27,12 @@
print("downloading...")

if task_name in CH_TASKS:
tokenizer = GLMLargeChTokenizer()
model_name = 'GLM-large-ch'
else:
tokenizer = GLMLargeEnWordPieceTokenizer()
model_name = 'GLM-large-en'

tokenizer = Tokenizer.from_pretrained(model_name)

train_dataset = Seq2SeqDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
5 changes: 3 additions & 2 deletions examples/glm_superglue/train_10b_clue.py
@@ -4,7 +4,7 @@
import os
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLMLargeChTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
@@ -21,11 +21,12 @@
save_dir="./glm_superglue_en",
save_interval=1)

model_name = "GLM-large-ch"
model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-large-ch")


tokenizer = GLMLargeChTokenizer()
tokenizer = Tokenizer.from_pretrained("GLM-large-ch")
train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
8 changes: 4 additions & 4 deletions examples/glm_superglue/train_10b_superglue.py
@@ -3,7 +3,7 @@
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze
from flagai.data.tokenizer import GLM10bENBPETokenizer, GLMLargeEnWordPieceTokenizer
from flagai.data.tokenizer import Tokenizer
from flagai.metrics import accuracy_metric
from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
@@ -28,11 +28,11 @@
# deepspeed_config='./deepspeed.json',
# training_script=__file__)

model_name = "GLM-large-en"
model = GLMForSingleTokenCloze.from_pretrain(download_path="/mnt/test_10b_models",
model_name="GLM-large-en")
model_name=model_name)

tokenizer = GLMLargeEnWordPieceTokenizer()

tokenizer = Tokenizer.from_pretrained(model_name)
train_dataset = SuperGlueDataset(task_name=task_name,
data_dir='./datasets/',
dataset_type='train',
12 changes: 4 additions & 8 deletions examples/glm_superglue/train_prefix.py
@@ -2,13 +2,12 @@
#
# Licensed under the Apache License, Version 2.0 (the "License")
from flagai.trainer import Trainer
from flagai.model.glm_model import GLMForSingleTokenCloze, GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, GLMForSequenceClassification
from flagai.data.tokenizer import GLMLargeEnWordPieceTokenizer, GLMLargeChTokenizer
from flagai.model.glm_model import GLMForSequenceClassification
from flagai.data.tokenizer import Tokenizer

from flagai.data.dataset import SuperGlueDataset
from flagai.test_utils import CollateArguments
from flagai.data.dataset.superglue.control import DEFAULT_METRICS, MULTI_TOKEN_TASKS, CH_TASKS
import unittest
from flagai.data.dataset import ConstructSuperglueStrategy


@@ -32,13 +31,10 @@

if task_name in CH_TASKS:
model_name = 'GLM-large-ch'
tokenizer = GLMLargeChTokenizer(add_block_symbols=True,
add_task_mask=False,
add_decoder_mask=False,
fix_command_token=True)
add_block_symbols=True,
else:
model_name = 'GLM-large-en'
tokenizer = GLMLargeEnWordPieceTokenizer()
tokenizer = Tokenizer.from_pretrained(model_name)

model = GLMForSequenceClassification.from_pretrain(model_name=model_name, spell_length=2,
class_num=3, tune_prefix_layers=1)