update bert and optimizers
Lupin1998 committed Dec 19, 2022
1 parent 173bcce commit 40e1ce9
Showing 10 changed files with 660 additions and 139 deletions.
37 changes: 37 additions & 0 deletions configs/selfsup/_base_/datasets/gRNA/K562_pretrain.py
@@ -0,0 +1,37 @@
# dataset settings
data_root = 'data/on_target_K562/train/'
data_source_cfg = dict(
    type='BioSeqDataset',
    file_list=None,  # use all splits
    word_splitor="", data_splitor="\t", mapping_name="ACGT",  # gRNA tokenize
    has_labels=True, return_label=False,  # pre-training
    max_data_length=int(1e7),
    data_type="regression",
)

dataset_type = 'ExtractDataset'
sample_norm_cfg = dict(mean=[0,], std=[1,])
train_pipeline = [
    dict(type='ToTensor'),
]
test_pipeline = [
    dict(type='ToTensor'),
]
# prefetch
prefetch = False

data = dict(
    samples_per_gpu=256,
    workers_per_gpu=4,
    drop_last=True,
    train=dict(
        type=dataset_type,
        data_source=dict(
            root=data_root, **data_source_cfg),
        pipeline=train_pipeline,
        prefetch=prefetch,
    ),
)

# checkpoint
checkpoint_config = dict(interval=200, max_keep_ckpts=1)
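For orientation, the tokenization implied by this data source config: `word_splitor=""` points at character-level tokens, `mapping_name="ACGT"` at the four-nucleotide vocabulary, and `data_splitor="\t"` at tab-separated records whose label column is read (`has_labels=True`) but not returned during pre-training (`return_label=False`). The sketch below only illustrates that reading; it is not the repository's `BioSeqDataset` implementation, and the helper name `encode_line` is hypothetical.

```python
# Illustration only (not the repository's BioSeqDataset): reading one record as
# implied by word_splitor="", data_splitor="\t", mapping_name="ACGT".
import torch

MAPPING = "ACGT"                              # mapping_name="ACGT"
CHAR2ID = {c: i for i, c in enumerate(MAPPING)}

def encode_line(line: str, return_label: bool = False):
    """Hypothetical helper: tokenize a tab-separated gRNA record per character."""
    fields = line.rstrip("\n").split("\t")    # data_splitor="\t"
    seq, label = fields[0], fields[1] if len(fields) > 1 else None
    token_ids = torch.tensor([CHAR2ID[c] for c in seq])   # word_splitor="" -> one token per character
    one_hot = torch.nn.functional.one_hot(token_ids, num_classes=len(MAPPING)).float()
    if return_label and label is not None:
        return one_hot, float(label)          # data_type="regression"
    return one_hot                            # pre-training: label is dropped

sample = encode_line("ACGTGGCA\t0.73")        # (8, 4) tensor -> matches in_channels=4
```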
9 changes: 5 additions & 4 deletions configs/selfsup/_base_/datasets/gRNA/gRNA_pretrain.py
@@ -1,13 +1,15 @@
 # dataset settings
-data_root = 'data/on_target_K562/'
+data_root = 'data/gRNA_pretrain/'
 data_source_cfg = dict(
     type='BioSeqDataset',
     file_list=None,  # use all splits
     word_splitor="", data_splitor="\t", mapping_name="ACGT",  # gRNA tokenize
     has_labels=False, return_label=False,  # pre-training
+    max_data_length=int(1e7),
+    data_type="regression",
 )
 
-dataset_type = 'RegressionDataset'
+dataset_type = 'ExtractDataset'
 sample_norm_cfg = dict(mean=[0,], std=[1,])
 train_pipeline = [
     dict(type='ToTensor'),
@@ -25,8 +27,7 @@
     train=dict(
         type=dataset_type,
         data_source=dict(
-            root=data_root+"train",
-            **data_source_cfg),
+            root=data_root, **data_source_cfg),
         pipeline=train_pipeline,
         prefetch=prefetch,
     ),
@@ -0,0 +1,90 @@
_base_ = [
    '../../../_base_/datasets/gRNA/gRNA_pretrain.py',
    '../../../_base_/default_runtime.py',
]

embed_dim = 64
patch_size = 2
seq_len = 63

# model settings
model = dict(
    type='BERT',
    pretrained=None,
    mask_ratio=0.15,  # BERT 15%
    spin_stride=[1, 2, 4],
    backbone=dict(
        type='SimMIMTransformer',
        arch=dict(
            embed_dims=embed_dim,
            num_layers=4,
            num_heads=4,
            feedforward_channels=embed_dim * 4,
        ),
        in_channels=4,
        patch_size=patch_size,
        seq_len=int(seq_len / patch_size) + bool(seq_len % patch_size != 0),
        mask_layer=0,
        mask_ratio=0.15,  # BERT 15%
        mask_token='learnable',
        # mask_token='zero',
        norm_cfg=dict(type='LN', eps=1e-6),
        drop_rate=0.,  # no dropout for pre-training
        drop_path_rate=0.1,
        final_norm=True,
        out_indices=-1,  # last layer
        with_cls_token=True,
        output_cls_token=True,
    ),
    neck=dict(
        type='BERTMLMNeck', feature_Nd="1d",
        in_channels=embed_dim, out_channels=4, encoder_stride=patch_size),
    head=dict(
        type='MIMHead',
        loss=dict(type='CrossEntropyLoss',
                  use_soft=True, use_sigmoid=False, loss_weight=1.0),
        feature_Nd="1d", unmask_weight=0., encoder_in_channels=4,
    ),
    init_cfg=[
        dict(type='TruncNormal', layer=['Conv1d', 'Linear'], std=0.02, bias=0.),
        dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.)
    ],
)

# dataset
data = dict(samples_per_gpu=256, workers_per_gpu=4)

# optimizer
optimizer = dict(
    type='AdamW',
    lr=1e-3,
    weight_decay=1e-2, eps=1e-8, betas=(0.9, 0.999),
    paramwise_options={
        '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.),
        'norm': dict(weight_decay=0.),
        'bias': dict(weight_decay=0.),
        'cls_token': dict(weight_decay=0.),
        'pos_embed': dict(weight_decay=0.),
        'mask_token': dict(weight_decay=0.),
    })

# apex
use_fp16 = False
fp16 = dict(type='mmcv', loss_scale=dict(mode='dynamic'))
optimizer_config = dict(
    grad_clip=dict(max_norm=1000.0), update_interval=1)

# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    by_epoch=False, min_lr=1e-5,
    warmup='linear',
    warmup_iters=5, warmup_by_epoch=True,
    warmup_ratio=1e-5,
)

# checkpoint
checkpoint_config = dict(interval=200, max_keep_ckpts=1)

# runtime settings
runner = dict(type='EpochBasedRunner', max_epochs=100)
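Two details of this config are easy to miss. First, `seq_len=int(seq_len / patch_size) + bool(seq_len % patch_size != 0)` is a ceiling division: with `seq_len=63` and `patch_size=2` it gives `31 + 1 = 32` patch tokens. Second, `mask_ratio=0.15` with `mask_token='learnable'` is the BERT-style objective: roughly 15% of positions are replaced by a learnable token and reconstructed by the neck and head. The snippet below is only a sketch of that masking step under those assumptions; the actual logic lives in `SimMIMTransformer` and `BERTMLMNeck`, which are not part of this diff.

```python
# Illustration only (not the SimMIMTransformer implementation): BERT-style
# 15% masking with a learnable mask token on the patchified gRNA sequence.
import math
import torch

seq_len, patch_size, embed_dim, mask_ratio = 63, 2, 64, 0.15
num_patches = int(seq_len / patch_size) + bool(seq_len % patch_size != 0)
assert num_patches == math.ceil(seq_len / patch_size) == 32

tokens = torch.randn(8, num_patches, embed_dim)                # (batch, length, dim)
mask_token = torch.nn.Parameter(torch.zeros(1, 1, embed_dim))  # mask_token='learnable'

num_masked = max(1, int(num_patches * mask_ratio))             # ~15% of positions
scores = torch.rand(tokens.size(0), num_patches)
mask = torch.zeros(tokens.size(0), num_patches, dtype=torch.bool)
mask[torch.arange(tokens.size(0)).unsqueeze(1),
     scores.topk(num_masked, dim=1).indices] = True

# Masked positions are swapped for the learnable token; the neck then tries to
# reconstruct the original 4-channel (ACGT) input at exactly those positions
# (unmask_weight=0. means unmasked positions do not contribute to the loss).
masked_tokens = torch.where(mask.unsqueeze(-1), mask_token, tokens)
```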
3 changes: 2 additions & 1 deletion docs/CHANGELOG.md
@@ -4,10 +4,11 @@
 
 #### Highlight
 * Support various popular backbones (ConvNets and ViTs), various image datasets, popular mixup methods, and benchmarks for supervised learning. Config files are available.
-* Support popular self-supervised methods (e.g., BYOL, MoCo.V3, MAE) on both large-scale and small-scale datasets, and self-supervised benchmarks (merged from MMSelfSup). Config files are available.
+* Support popular self-supervised methods (e.g., BYOL, MoCo.V3, MAE) on both large-scale and small-scale datasets, and self-supervised benchmarks (merged from MMSelfSup). Config files are available. Support the BERT pre-training method and update config files.
 * Support analyzing tools for self-supervised learning (kNN/SVM/linear metrics and t-SNE/UMAP visualization).
 * Convenient usage of configs: fast configs generation by 'auto_train.py' and configs inheriting (MMCV).
 * Support mixed-precision training (NVIDIA Apex or MMCV Apex).
+* Refactor `openbioseq.core` and support the Adan optimizer.
 
 #### Bug Fixes
 * Done code refactoring follows MMSelfSup and MMClassification.
6 changes: 4 additions & 2 deletions openbioseq/core/optimizer/__init__.py
@@ -1,8 +1,10 @@
+from .adan import Adan
 from .builder import build_optimizer
 from .constructor import DefaultOptimizerConstructor, TransformerFinetuneConstructor
-from .optimizers import LARS, LAMB
+from .lamb import LAMB
+from .lars import LARS
 
 __all__ = [
-    'LARS', 'LAMB', 'build_optimizer',
+    'Adan', 'LARS', 'LAMB', 'build_optimizer',
     'DefaultOptimizerConstructor', 'TransformerFinetuneConstructor'
 ]
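`adan.py`, `lamb.py`, and `lars.py` themselves are not shown in this excerpt. Assuming `build_optimizer` follows the usual mmcv-style convention of taking a model and an optimizer config (an assumption; see `builder.py` and `constructor.py` for the actual signature), switching a config such as the AdamW block above to the newly exported Adan optimizer would look roughly like this; the Adan hyper-parameter names (`lr`, `weight_decay`) follow the common reference implementation and may differ here.

```python
# Sketch only: selecting the newly exported Adan optimizer through the repo's
# builder. The signature build_optimizer(model, cfg) and the Adan argument
# names are assumptions, not taken from this diff.
import torch.nn as nn
from openbioseq.core.optimizer import build_optimizer  # exports Adan after this commit

model = nn.Sequential(nn.Linear(64, 64), nn.LayerNorm(64), nn.Linear(64, 4))

optimizer_cfg = dict(
    type='Adan',                       # resolved via `from .adan import Adan`
    lr=1e-3, weight_decay=0.02,
    paramwise_options={
        'norm': dict(weight_decay=0.),  # same exclusion style as the BERT config above
        'bias': dict(weight_decay=0.),
    })

optimizer = build_optimizer(model, optimizer_cfg)
```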