
11 implement moe #116

Draft: wants to merge 37 commits into main from 11-implement-moe

Commits (37)
3585aed
feat: Added initial MoE support
mali-git Apr 8, 2024
6e11dd0
refactor(moe): Minor refactorings.
BlueCrescent Apr 8, 2024
8bf4e14
test(moe): Added initial tests for MoE modules.
BlueCrescent Apr 8, 2024
e05f08f
refactor(moe): Minor refactorings.
BlueCrescent Apr 15, 2024
dcd8cd3
feat(moe): Added MoE to GPT2 config.
BlueCrescent Apr 15, 2024
48136e8
feat: refactor and add configs
Apr 15, 2024
6bb1e3f
feat: add configs
Apr 18, 2024
68ef13f
fix(moe): Fixed some pydantic errors.
BlueCrescent Apr 22, 2024
3559058
fix(moe): Added missing MoE component imports.
BlueCrescent Apr 22, 2024
b6c9bce
fix(moe): Moved validators to correct BaseModel.
BlueCrescent Apr 22, 2024
cade4fb
feat: Updated the config to Add support for MoEBlock
Apr 22, 2024
e75605c
chore: Merge branch 'main' into 11-implement-moe
AbasKhan Apr 22, 2024
80bfd9b
refactor(config): Streamlined handling of empty config entries.
BlueCrescent Apr 29, 2024
26b8970
chore(config): Separated MoE config from standard lorem ipsum config.
BlueCrescent Apr 29, 2024
20b1dd2
chore: Merge remote-tracking branch 'origin/main' into 11-implement-moe
BlueCrescent Apr 29, 2024
077b846
chore(config): Updated checkpointing config for MoE.
BlueCrescent Apr 29, 2024
a685555
test(moe): Finished MoE tests.
BlueCrescent Apr 29, 2024
aecf02a
feat(config): Updated lorem ipsum configs to new gpt2 block style.
BlueCrescent Apr 29, 2024
ff6960c
fix: Added fixes to lorem ipsum configs
AbasKhan Apr 29, 2024
48faada
chore: Added minor fixes in the code and added config for MoE small c…
AbasKhan May 8, 2024
148c0ad
chore: Merge remote-tracking branch 'origin/main' into 11-implement-moe
AbasKhan May 8, 2024
46b5337
fix: Updated relevant config files to conform with the changes introd…
AbasKhan May 9, 2024
9d9ae1e
chore: Merge branch 'main' into 11-implement-moe
thomaschhh May 16, 2024
996c3d1
fix: remove duplicate
thomaschhh May 16, 2024
210a480
fix: adapt gradient_clipper to config_lorem_ipsum.yaml
thomaschhh May 16, 2024
f093593
fix: update evaluation subscriber
thomaschhh May 16, 2024
2052c92
fix(moe): Added weight initialization for MoEExpertGLU.
BlueCrescent May 23, 2024
c251d7b
feat: merged with main
AbasKhan May 27, 2024
a85dc69
fix: Added minor fixes to the config files and updated exception hand…
AbasKhan May 28, 2024
c50f497
chore: Merge branch 'main' into 11-implement-moe
AbasKhan Jun 3, 2024
6e69aa3
fix: Removed vocab size from validate_sizes, and added/updated releva…
AbasKhan Jun 3, 2024
288c0a2
fix: reverted changes made to toml file
AbasKhan Jun 4, 2024
a971c44
fix: Resolved conflicts
AbasKhan Jun 17, 2024
113b245
fix: Added fixes to the config files
AbasKhan Jun 17, 2024
4c9ca63
fix: removed redundant try/catch
AbasKhan Jun 17, 2024
2ef2898
fix: Added fix for rotary embedding check
AbasKhan Jun 17, 2024
e1f22b3
fix: Removed activation_type check from gpt2block
AbasKhan Jun 17, 2024
141 changes: 141 additions & 0 deletions config_files/text_generation/text_generation_overfitted_de_moe.yaml
@@ -0,0 +1,141 @@
settings:
referencing_keys:
sample_key: input_ids
prediction_key: logits
model_path: /raid/s3/opengptx/akhan/modalities/modalities/data/checkpoints/2024-06-03__15-18-50/eid_2024-06-03__15-18-50-model-num_steps_384.bin
device: 0
context_length: 2048
training:
global_training_log_interval_in_steps: 1
global_checkpointing_interval_in_steps: 128
global_evaluation_interval_in_steps: 64
global_num_seen_steps: 0
do_apply_activation_checkpointing: false
gradient_acc_steps: 1
local_train_micro_batch_size: 16
sequence_length: 2048
gradient_clipping:
mode: p2_norm
threshold: 1.0
cuda_env:
local_rank: "0"
global_rank: "0"
world_size: "1"
paths:
checkpointing_path: data/checkpoints

text_inference_component:
component_key: inference_component
variant_key: text
config:
device: ${settings.device}
model:
instance_key: checkpointed_model
pass_type: BY_REFERENCE
tokenizer:
instance_key: tokenizer
pass_type: BY_REFERENCE
context_length: ${settings.context_length}
eod_token: <eod>
prompt_template: "{prompt_input}" # "<instruction> Du bist Moody, ein LLM welches Menschen helfen soll. user: {prompt_input}"
temperature: 0
# chat: false

checkpointed_model:
component_key: model
variant_key: checkpointed
config:
checkpoint_loading:
component_key: checkpoint_loading
variant_key: torch
config:
device: ${settings.device}
precision: BF16
model:
instance_key: raw_model
pass_type: BY_REFERENCE
checkpoint_path: ${settings.model_path}

raw_model:
component_key: model
variant_key: gpt2
config:
sample_key: ${settings.referencing_keys.sample_key}
poe_type: NOPE
block_size: ${settings.training.sequence_length}
prediction_key: ${settings.referencing_keys.prediction_key}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head_q: 12
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
attention_config:
qkv_transforms:
- type_hint: RotaryTransform
config:
n_embd: ${raw_model.config.n_embd}
n_head: ${raw_model.config.n_head_q} #it has to be head_q here
seq_length_dim: -2
activation_type: gelu
weight_init:
mean: 0.0
std: 0.02
attention_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${raw_model.config.n_embd}
bias: true
epsilon: 1e-5

lm_head_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${raw_model.config.n_embd}
bias: true
epsilon: 1e-5
gpt2block:
component_key: block
variant_key: moe_block
config:
block_size: ${settings.training.sequence_length}
n_embd: ${raw_model.config.n_embd}
attention_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${raw_model.config.n_embd}
bias: true
epsilon: 1e-5
ffn_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${raw_model.config.n_embd}
bias: true
epsilon: 1e-5
attention_config: ${raw_model.config.attention_config}
dropout: ${raw_model.config.dropout}
ffn_hidden: 2048
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
n_head_q: ${raw_model.config.n_head_q}
n_head_kv: 12
moe_num_experts: 4
moe_top_k: 2
moe_normalize_expert_weights: 0.1
uniform_expert_assignment: false
moe_jitter_eps: 0.1
moe_act_fn:
component_key: moe_act_fn
variant_key: silu
config:

tokenizer:
component_key: tokenizer
variant_key: pretrained_hf_tokenizer
config:
pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2
padding: false
max_length: ${settings.context_length}
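The moe_block section above introduces the routing hyperparameters moe_num_experts, moe_top_k, moe_jitter_eps, moe_normalize_expert_weights and moe_act_fn. As a rough orientation only, the following is a minimal PyTorch sketch of what a top-k router with these knobs typically does; it is not the MoEBlock/MoEExpertGLU code added in this PR, and all class and argument names are invented for the example.

import torch
import torch.nn as nn
import torch.nn.functional as F


class TopKRouter(nn.Module):
    # learned gate; uniform_expert_assignment: false in the config means this
    # learned routing is used instead of a fixed round-robin assignment
    def __init__(self, n_embd: int, num_experts: int, top_k: int,
                 jitter_eps: float = 0.0, normalize_expert_weights: float = 0.0):
        super().__init__()
        self.gate = nn.Linear(n_embd, num_experts, bias=False)
        self.top_k = top_k
        self.jitter_eps = jitter_eps
        self.normalize_expert_weights = normalize_expert_weights

    def forward(self, x: torch.Tensor):
        if self.training and self.jitter_eps > 0:
            # multiplicative input noise, in the spirit of moe_jitter_eps
            x = x * torch.empty_like(x).uniform_(1.0 - self.jitter_eps,
                                                 1.0 + self.jitter_eps)
        scores = F.softmax(self.gate(x), dim=-1)             # (tokens, experts)
        weights, experts = scores.topk(self.top_k, dim=-1)   # (tokens, top_k)
        if self.normalize_expert_weights > 0:
            # renormalize the selected weights by their p-norm,
            # in the spirit of moe_normalize_expert_weights
            weights = weights / weights.norm(p=self.normalize_expert_weights,
                                             dim=-1, keepdim=True)
        return weights, experts


class TinyMoE(nn.Module):
    # routes each token to its top_k experts (SiLU MLPs, cf. moe_act_fn: silu)
    # and sums the weighted expert outputs
    def __init__(self, n_embd: int, ffn_hidden: int, num_experts: int, top_k: int):
        super().__init__()
        self.router = TopKRouter(n_embd, num_experts, top_k,
                                 jitter_eps=0.1, normalize_expert_weights=0.1)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(n_embd, ffn_hidden), nn.SiLU(),
                          nn.Linear(ffn_hidden, n_embd))
            for _ in range(num_experts)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        tokens = x.reshape(-1, x.shape[-1])
        weights, experts = self.router(tokens)
        out = torch.zeros_like(tokens)
        for k in range(weights.shape[-1]):
            for e, expert in enumerate(self.experts):
                mask = experts[:, k] == e
                if mask.any():
                    out[mask] += weights[mask, k].unsqueeze(-1) * expert(tokens[mask])
        return out.reshape_as(x)


# values matching the moe_block config above: 4 experts, top-2 routing
moe = TinyMoE(n_embd=768, ffn_hidden=2048, num_experts=4, top_k=2)
y = moe(torch.randn(2, 16, 768))  # (batch, seq, n_embd)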
44 changes: 31 additions & 13 deletions config_files/training/config_example_mem_map_dataset.yaml
@@ -1,5 +1,6 @@
settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
sample_key: input_ids
target_key: target_ids
@@ -126,7 +127,7 @@ checkpointing:
experiment_id: ${settings.experiment_id}
mixed_precision_settings: BF_16
sharding_strategy: FULL_SHARD
block_names: [GPT2Block]
block_names: [TransformerBlock]

wrapped_model:
component_key: model
@@ -138,7 +139,7 @@ wrapped_model:
sync_module_states: true
mixed_precision_settings: BF_16
sharding_strategy: FULL_SHARD
block_names: [GPT2Block]
block_names: [TransformerBlock]

model:
component_key: model
@@ -151,11 +152,8 @@ model:
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head_q: 12
n_head_kv: 12
ffn_hidden: 2048
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
attention_config:
qkv_transforms:
- type_hint: RotaryTransform
@@ -174,20 +172,39 @@ model:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
ffn_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
lm_head_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
gpt2block:
component_key: block
variant_key: transformer_block
config:
block_size: ${settings.training.sequence_length}
n_embd: ${model.config.n_embd}
attention_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
ffn_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
attention_config: ${model.config.attention_config}
dropout: ${model.config.dropout}
bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
n_head_q: ${model.config.n_head_q}
n_head_kv: 12
ffn_hidden: 2048

loss_fn:
component_key: loss
@@ -238,4 +255,5 @@ evaluation_subscriber:
project: modalities
mode: ONLINE
experiment_id: ${settings.experiment_id}
directory: "."
directory: "."
config_file_path: ${settings.config_file_path}
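For context, the nested gpt2block entry follows the usual pre-norm block layout: attention_norm is applied before the attention sub-module and ffn_norm before the feed-forward (or MoE) sub-module, each inside a residual connection. A small generic sketch, assuming this standard wiring rather than reproducing the repository's actual TransformerBlock:

import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    # minimal stand-in for the rms_norm layer_norm variant used in the configs
    def __init__(self, ndim: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
        return x * rms * self.weight


class PreNormBlock(nn.Module):
    # attention_norm before attention, ffn_norm before the feed-forward part,
    # both wrapped in residual connections
    def __init__(self, n_embd: int, attn: nn.Module, ffn: nn.Module):
        super().__init__()
        self.attention_norm = RMSNorm(n_embd)
        self.ffn_norm = RMSNorm(n_embd)
        self.attn = attn
        self.ffn = ffn

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.attention_norm(x))
        x = x + self.ffn(self.ffn_norm(x))
        return x


# toy usage with stand-in sub-modules; a real block would plug in causal
# self-attention and either a dense FFN (transformer_block) or an MoE
# layer (moe_block) here
def mlp() -> nn.Module:
    return nn.Sequential(nn.Linear(768, 2048), nn.GELU(), nn.Linear(2048, 768))


block = PreNormBlock(n_embd=768, attn=mlp(), ffn=mlp())
y = block(torch.randn(2, 16, 768))  # (batch, seq, n_embd)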
69 changes: 39 additions & 30 deletions config_files/training/config_gpt2_small_overfitting_de.yaml
@@ -1,5 +1,6 @@
settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
sample_key: input_ids
target_key: target_ids
@@ -39,7 +40,7 @@ val_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_en.pbin
raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_de.pbin
block_size: ${settings.training.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}

@@ -143,7 +144,7 @@ wrapped_model:
sync_module_states: true
mixed_precision_settings: BF_16
sharding_strategy: FULL_SHARD
block_names: [GPT2Block]
block_names: [TransformerBlock]

model:
component_key: model
@@ -156,44 +157,52 @@ model:
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head_q: 12
n_head_kv: 12
ffn_hidden: 2048
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
attention_config:
qkv_transforms:
- type_hint: RotaryTransform
config:
n_embd: ${model.config.n_embd}
n_head: ${model.config.n_head_q} #it has to be head_q here
seq_length_dim: -2
activation_type: gelu
weight_init:
mean: 0.0
std: 0.02
attention_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
ffn_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
lm_head_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5

gpt2block:
component_key: block
variant_key: transformer_block
config:
block_size: ${settings.training.sequence_length}
n_embd: ${model.config.n_embd}
attention_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
ffn_norm:
component_key: layer_norm
variant_key: rms_norm
config:
ndim: ${model.config.n_embd}
bias: true
epsilon: 1e-5
activation_type: gelu
attention_config:
qkv_transforms:
- type_hint: RotaryTransform
config:
n_embd: ${model.config.n_embd}
n_head: ${model.config.n_head_q} #it has to be head_q here
seq_length_dim: -2
dropout: ${model.config.dropout}
ffn_hidden: 2048
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
n_head_q: ${model.config.n_head_q}
n_head_kv: 12

loss_fn:
component_key: loss
variant_key: clm_cross_entropy_loss
@@ -237,7 +246,6 @@ batch_progress_subscriber:
variant_key: rich
config:
local_rank: ${settings.cuda_env.local_rank}
world_size: ${settings.cuda_env.world_size}
global_num_seen_steps: ${settings.training.global_num_seen_steps}
train_dataloader:
instance_key: train_dataloader
@@ -255,4 +263,5 @@ evaluation_subscriber:
project: modalities
mode: ONLINE
experiment_id: ${settings.experiment_id}
directory: "."
directory: "."
config_file_path: ${settings.config_file_path}
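These configs lean heavily on ${...} interpolation (for example ${model.config.n_embd}) and on the modalities_env resolver that this PR now also uses to pass config_file_path through to the evaluation subscriber. A hedged sketch of how such references resolve, assuming OmegaConf-style interpolation; the resolver body below is a stand-in, not the project's real modalities_env implementation:

from omegaconf import OmegaConf

# stand-in resolver; the real modalities_env resolver is provided by modalities
OmegaConf.register_new_resolver(
    "modalities_env",
    lambda key: {"experiment_id": "demo-run",
                 "config_file_path": "config_lorem_ipsum.yaml"}[key],
)

yaml_snippet = """
settings:
  experiment_id: ${modalities_env:experiment_id}
  config_file_path: ${modalities_env:config_file_path}
model:
  config:
    n_embd: 768
gpt2block:
  config:
    n_embd: ${model.config.n_embd}
evaluation_subscriber:
  config_file_path: ${settings.config_file_path}
"""

cfg = OmegaConf.create(yaml_snippet)
print(cfg.gpt2block.config.n_embd)                 # 768
print(cfg.evaluation_subscriber.config_file_path)  # config_lorem_ipsum.yaml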