diff --git a/config_files/text_generation/text_generation_overfitted_de_moe.yaml b/config_files/text_generation/text_generation_overfitted_de_moe.yaml new file mode 100644 index 000000000..d31ce04a2 --- /dev/null +++ b/config_files/text_generation/text_generation_overfitted_de_moe.yaml @@ -0,0 +1,141 @@ +settings: + referencing_keys: + sample_key: input_ids + prediction_key: logits + model_path: /raid/s3/opengptx/akhan/modalities/modalities/data/checkpoints/2024-06-03__15-18-50/eid_2024-06-03__15-18-50-model-num_steps_384.bin + device: 0 + context_length: 2048 + training: + global_training_log_interval_in_steps: 1 + global_checkpointing_interval_in_steps: 128 + global_evaluation_interval_in_steps: 64 + global_num_seen_steps: 0 + do_apply_activation_checkpointing: false + gradient_acc_steps: 1 + local_train_micro_batch_size: 16 + sequence_length: 2048 + gradient_clipping: + mode: p2_norm + threshold: 1.0 + cuda_env: + local_rank: "0" + global_rank: "0" + world_size: "1" + paths: + checkpointing_path: data/checkpoints + +text_inference_component: + component_key: inference_component + variant_key: text + config: + device: ${settings.device} + model: + instance_key: checkpointed_model + pass_type: BY_REFERENCE + tokenizer: + instance_key: tokenizer + pass_type: BY_REFERENCE + context_length: ${settings.context_length} + eod_token: + prompt_template: "{prompt_input}" # " Du bist Moody, ein LLM welches Menschen helfen soll. user: {prompt_input}" + temperature: 0 + # chat: false + +checkpointed_model: + component_key: model + variant_key: checkpointed + config: + checkpoint_loading: + component_key: checkpoint_loading + variant_key: torch + config: + device: ${settings.device} + precision: BF16 + model: + instance_key: raw_model + pass_type: BY_REFERENCE + checkpoint_path: ${settings.model_path} + +raw_model: + component_key: model + variant_key: gpt2 + config: + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + block_size: ${settings.training.sequence_length} + prediction_key: ${settings.referencing_keys.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 12 + n_head_q: 12 + n_embd: 768 + dropout: 0.0 + bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${raw_model.config.n_embd} + n_head: ${raw_model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + activation_type: gelu + weight_init: + mean: 0.0 + std: 0.02 + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${raw_model.config.n_embd} + bias: true + epsilon: 1e-5 + + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${raw_model.config.n_embd} + bias: true + epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: moe_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${raw_model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${raw_model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${raw_model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${raw_model.config.attention_config} + dropout: ${raw_model.config.dropout} + ffn_hidden: 2048 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + n_head_q: ${raw_model.config.n_head_q} + n_head_kv: 12 + moe_num_experts: 4 + moe_top_k: 2 + moe_normalize_expert_weights: 0.1 + uniform_expert_assignment: false + moe_jitter_eps: 0.1 + moe_act_fn: + component_key: moe_act_fn + variant_key: silu + config: + +tokenizer: + component_key: tokenizer + variant_key: pretrained_hf_tokenizer + config: + pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2 + padding: false + max_length: ${settings.context_length} \ No newline at end of file diff --git a/config_files/training/config_example_mem_map_dataset.yaml b/config_files/training/config_example_mem_map_dataset.yaml index 4e1cf9f24..631d9e298 100644 --- a/config_files/training/config_example_mem_map_dataset.yaml +++ b/config_files/training/config_example_mem_map_dataset.yaml @@ -1,5 +1,6 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids @@ -126,7 +127,7 @@ checkpointing: experiment_id: ${settings.experiment_id} mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] wrapped_model: component_key: model @@ -138,7 +139,7 @@ wrapped_model: sync_module_states: true mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] model: component_key: model @@ -151,11 +152,8 @@ model: vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 12 n_head_q: 12 - n_head_kv: 12 - ffn_hidden: 2048 n_embd: 768 dropout: 0.0 - bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster attention_config: qkv_transforms: - type_hint: RotaryTransform @@ -174,13 +172,6 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 - ffn_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 lm_head_norm: component_key: layer_norm variant_key: rms_norm @@ -188,6 +179,32 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: transformer_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${model.config.attention_config} + dropout: ${model.config.dropout} + bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + ffn_hidden: 2048 loss_fn: component_key: loss @@ -238,4 +255,5 @@ evaluation_subscriber: project: modalities mode: ONLINE experiment_id: ${settings.experiment_id} - directory: "." \ No newline at end of file + directory: "." 
+ config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/config_gpt2_small_overfitting_de.yaml b/config_files/training/config_gpt2_small_overfitting_de.yaml index b56c08ace..8da4d425f 100644 --- a/config_files/training/config_gpt2_small_overfitting_de.yaml +++ b/config_files/training/config_gpt2_small_overfitting_de.yaml @@ -1,5 +1,6 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids @@ -39,7 +40,7 @@ val_dataset: component_key: dataset variant_key: packed_mem_map_dataset_continuous config: - raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_en.pbin + raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_de.pbin block_size: ${settings.training.sequence_length} sample_key: ${settings.referencing_keys.sample_key} @@ -143,7 +144,7 @@ wrapped_model: sync_module_states: true mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] model: component_key: model @@ -156,36 +157,11 @@ model: vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 12 n_head_q: 12 - n_head_kv: 12 - ffn_hidden: 2048 n_embd: 768 dropout: 0.0 - bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster - attention_config: - qkv_transforms: - - type_hint: RotaryTransform - config: - n_embd: ${model.config.n_embd} - n_head: ${model.config.n_head_q} #it has to be head_q here - seq_length_dim: -2 - activation_type: gelu weight_init: mean: 0.0 std: 0.02 - attention_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 - ffn_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 lm_head_norm: component_key: layer_norm variant_key: rms_norm @@ -193,7 +169,40 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 - + gpt2block: + component_key: block + variant_key: transformer_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + activation_type: gelu + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + dropout: ${model.config.dropout} + ffn_hidden: 2048 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + loss_fn: component_key: loss variant_key: clm_cross_entropy_loss @@ -237,7 +246,6 @@ batch_progress_subscriber: variant_key: rich config: local_rank: ${settings.cuda_env.local_rank} - world_size: ${settings.cuda_env.world_size} global_num_seen_steps: ${settings.training.global_num_seen_steps} train_dataloader: instance_key: train_dataloader @@ -255,4 +263,5 @@ evaluation_subscriber: project: modalities mode: ONLINE experiment_id: ${settings.experiment_id} - directory: "." 
\ No newline at end of file + directory: "." + config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/config_gpt2_small_overfitting_de_abs_pos_emb.yaml b/config_files/training/config_gpt2_small_overfitting_de_abs_pos_emb.yaml index 7510ec23b..46e1ed245 100644 --- a/config_files/training/config_gpt2_small_overfitting_de_abs_pos_emb.yaml +++ b/config_files/training/config_gpt2_small_overfitting_de_abs_pos_emb.yaml @@ -1,5 +1,6 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids @@ -134,7 +135,7 @@ checkpointing: experiment_id: ${settings.experiment_id} mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] wrapped_model: component_key: model @@ -146,7 +147,7 @@ wrapped_model: sync_module_states: true mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] model: component_key: model @@ -159,11 +160,8 @@ model: vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 12 n_head_q: 12 - n_head_kv: 12 - ffn_hidden: 2048 n_embd: 768 dropout: 0.0 - bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster attention_config: qkv_transforms: - type_hint: IdentityTransform @@ -179,13 +177,6 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 - ffn_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 lm_head_norm: component_key: layer_norm variant_key: rms_norm @@ -193,6 +184,32 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: transformer_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${model.config.attention_config} + dropout: ${model.config.dropout} + bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + ffn_hidden: 2048 loss_fn: component_key: loss @@ -253,6 +270,7 @@ evaluation_subscriber: config: local_rank: ${settings.cuda_env.local_rank} project: modalities - mode: ONLINE + mode: OFFLINE experiment_id: ${settings.experiment_id} - directory: "." \ No newline at end of file + directory: "." 
+ config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/config_gpt2_small_redpajama_DE_1048576.yaml b/config_files/training/config_gpt2_small_redpajama_DE_1048576.yaml index bf5afc6f1..1feca1dde 100644 --- a/config_files/training/config_gpt2_small_redpajama_DE_1048576.yaml +++ b/config_files/training/config_gpt2_small_redpajama_DE_1048576.yaml @@ -1,5 +1,6 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids @@ -142,7 +143,7 @@ wrapped_model: sync_module_states: true mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] model: component_key: model @@ -155,8 +156,6 @@ model: vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 12 n_head_q: 12 - n_head_kv: 12 - ffn_hidden: 2048 n_embd: 768 dropout: 0.0 bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster @@ -178,13 +177,6 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 - ffn_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 lm_head_norm: component_key: layer_norm variant_key: rms_norm @@ -192,6 +184,32 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: transformer_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${model.config.attention_config} + dropout: ${model.config.dropout} + ffn_hidden: 2048 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 loss_fn: component_key: loss @@ -255,4 +273,5 @@ evaluation_subscriber: project: modalities mode: ONLINE experiment_id: ${settings.experiment_id} - directory: "." \ No newline at end of file + directory: "." + config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/config_lorem_ipsum.yaml b/config_files/training/config_lorem_ipsum.yaml index 864503a77..eb90cace0 100644 --- a/config_files/training/config_lorem_ipsum.yaml +++ b/config_files/training/config_lorem_ipsum.yaml @@ -169,7 +169,7 @@ wrapped_model: sync_module_states: true mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] model: component_key: model @@ -182,36 +182,11 @@ model: vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 2 n_head_q: 8 - n_head_kv: 4 - ffn_hidden: 128 n_embd: 128 - dropout: 0.0 - bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster - attention_config: - qkv_transforms: - - type_hint: RotaryTransform - config: - n_embd: ${model.config.n_embd} - n_head: ${model.config.n_head_q} #it has to be head_q here - seq_length_dim: -2 - activation_type: gelu + dropout: 0.0 weight_init: mean: 0.0 std: 0.02 - attention_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 - ffn_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 lm_head_norm: component_key: layer_norm variant_key: rms_norm @@ -219,6 +194,39 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: transformer_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + activation_type: gelu + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + dropout: ${model.config.dropout} + ffn_hidden: 128 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 8 scheduler: component_key: scheduler @@ -230,7 +238,7 @@ scheduler: max_lr: 6e-4 div_factor: 10 final_div_factor: 1 - total_steps: 16 + total_steps: 32 pct_start: 0.01 anneal_strategy: cos diff --git a/config_files/training/moe_configs/config_example_mem_map_dataset_moe.yaml b/config_files/training/moe_configs/config_example_mem_map_dataset_moe.yaml new file mode 100644 index 000000000..34f9825b6 --- /dev/null +++ b/config_files/training/moe_configs/config_example_mem_map_dataset_moe.yaml @@ -0,0 +1,268 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + training: + callback_interval_in_samples: 32768 + global_num_training_samples: 2048 + global_num_seen_samples: 0 + do_apply_activation_checkpointing: false + gradient_acc_steps: 1 + local_train_micro_batch_size: 16 + sequence_length: 2048 + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpointing_path: data/checkpoints + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpajama_v2_default_DE_num_docs_1048576/redpyjama_v2_default_DE_num_docs_1048576.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "train" + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: 
batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +val_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpyjama_v2_default_DE_num_docs_1024/redpyjama_v2_default_DE_num_docs_1024.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +val_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "val" + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + +checkpointing: + component_key: checkpointing + variant_key: default + config: + checkpointing_strategy: + component_key: checkpointing_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpointing_execution: + component_key: checkpointing_execution + variant_key: fsdp_to_disc_checkpointing + config: + checkpoint_path: ${settings.paths.checkpointing_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +wrapped_model: + component_key: model + variant_key: fsdp_wrapped + config: + model: + instance_key: model + pass_type: BY_REFERENCE + sync_module_states: true + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +model: + component_key: model + variant_key: gpt2 + config: + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + block_size: ${settings.training.sequence_length} + prediction_key: ${settings.referencing_keys.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 12 + n_head_q: 12 + n_embd: 768 + dropout: 0.0 + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + activation_type: gelu + weight_init: + mean: 0.0 + std: 0.02 + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: moe_block + config: + block_size: ${settings.training.sequence_length} + n_embd: 
${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${model.config.attention_config} + dropout: ${model.config.dropout} + bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + ffn_hidden: 2048 + moe_num_experts: 4 + moe_top_k: 2 + moe_normalize_expert_weights: 0.1 + uniform_expert_assignment: false + moe_jitter_eps: 0.1 + moe_act_fn: + component_key: moe_act_fn + variant_key: silu + config: + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp + config: + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + +batch_progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + local_rank: ${settings.cuda_env.local_rank} + world_size: ${settings.cuda_env.world_size} + global_num_seen_samples: ${settings.training.global_num_seen_samples} + train_dataloader: + instance_key: train_dataloader + pass_type: BY_REFERENCE + eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + local_rank: ${settings.cuda_env.local_rank} + project: modalities + mode: ONLINE + experiment_id: ${settings.experiment_id} + directory: "." 
+ config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/moe_configs/config_gpt2_small_overfitting_de_abs_pos_emb_moe.yaml b/config_files/training/moe_configs/config_gpt2_small_overfitting_de_abs_pos_emb_moe.yaml new file mode 100644 index 000000000..5e919bdcf --- /dev/null +++ b/config_files/training/moe_configs/config_gpt2_small_overfitting_de_abs_pos_emb_moe.yaml @@ -0,0 +1,285 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + training: + global_training_log_interval_in_steps: 1 + global_checkpointing_interval_in_steps: 128 + global_evaluation_interval_in_steps: 64 + global_num_seen_steps: 0 + do_apply_activation_checkpointing: false + gradient_acc_steps: 1 + local_train_micro_batch_size: 16 + sequence_length: 2048 + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpointing_path: data/checkpoints + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /workspaces/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_de.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +val_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /workspaces/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_en.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: repeating_data_loader + config: + reshuffle_after_epoch: false + num_epochs: 100 + dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +val_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: val + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: val_dataloader + 
pass_type: BY_REFERENCE + + +checkpoint_saving: + component_key: checkpointing + variant_key: default + config: + checkpointing_strategy: + component_key: checkpointing_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpointing_execution: + component_key: checkpointing_execution + variant_key: fsdp_to_disc_checkpointing + config: + checkpoint_path: ${settings.paths.checkpointing_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +wrapped_model: + component_key: model + variant_key: fsdp_wrapped + config: + model: + instance_key: model + pass_type: BY_REFERENCE + sync_module_states: true + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +model: + component_key: model + variant_key: gpt2 + config: + sample_key: ${settings.referencing_keys.sample_key} + poe_type: ABSOLUTE + block_size: ${settings.training.sequence_length} + prediction_key: ${settings.referencing_keys.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 12 + n_head_q: 12 + n_embd: 768 + dropout: 0.0 + attention_config: + qkv_transforms: + - type_hint: IdentityTransform + config: {} + activation_type: gelu + weight_init: + mean: 0.0 + std: 0.02 + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: moe_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${model.config.attention_config} + dropout: ${model.config.dropout} + bias: true # True: bias in Linears, like GPT-2. 
False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + ffn_hidden: 2048 + moe_num_experts: 4 + moe_top_k: 2 + moe_normalize_expert_weights: 0.1 + uniform_expert_assignment: false + moe_jitter_eps: 0.1 + moe_act_fn: + component_key: moe_act_fn + variant_key: silu + config: + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp + config: + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + +scheduler: + component_key: scheduler + variant_key: dummy_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + + +batch_progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + local_rank: ${settings.cuda_env.local_rank} + world_size: ${settings.cuda_env.world_size} + global_num_seen_steps: ${settings.training.global_num_seen_steps} + train_dataloader: + instance_key: train_dataloader + pass_type: BY_REFERENCE + eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + local_rank: ${settings.cuda_env.local_rank} + project: modalities + mode: ONLINE + experiment_id: ${settings.experiment_id} + directory: "." + config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/moe_configs/config_gpt2_small_overfitting_de_moe.yaml b/config_files/training/moe_configs/config_gpt2_small_overfitting_de_moe.yaml new file mode 100644 index 000000000..7496b0fb1 --- /dev/null +++ b/config_files/training/moe_configs/config_gpt2_small_overfitting_de_moe.yaml @@ -0,0 +1,280 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + training: + global_training_log_interval_in_steps: 1 + global_checkpointing_interval_in_steps: 128 + global_evaluation_interval_in_steps: 64 + global_num_seen_steps: 0 + do_apply_activation_checkpointing: false + gradient_acc_steps: 1 + local_train_micro_batch_size: 16 + sequence_length: 2048 + gradient_clipping: + mode: p2_norm + threshold: 1.0 + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpointing_path: data/checkpoints + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_de.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +val_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: 
/raid/s3/opengptx/max_lue/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_de.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: repeating_data_loader + config: + reshuffle_after_epoch: false + num_epochs: 100 + dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +val_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: val + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: fsdp + config: + checkpoint_path: ${settings.paths.checkpointing_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +wrapped_model: + component_key: model + variant_key: fsdp_wrapped + config: + model: + instance_key: model + pass_type: BY_REFERENCE + sync_module_states: true + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +model: + component_key: model + variant_key: gpt2 + config: + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + block_size: ${settings.training.sequence_length} + prediction_key: ${settings.referencing_keys.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 12 + n_head_q: 12 + n_embd: 768 + dropout: 0.0 + weight_init: + mean: 0.0 + std: 0.02 + + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: moe_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: 
rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + activation_type: gelu + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + dropout: ${model.config.dropout} + ffn_hidden: 2048 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + moe_num_experts: 4 + moe_top_k: 2 + moe_normalize_expert_weights: 0.1 + uniform_expert_assignment: false + moe_jitter_eps: 0.1 + moe_act_fn: + component_key: moe_act_fn + variant_key: silu + config: + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp + config: + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + +scheduler: + component_key: scheduler + variant_key: dummy_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + + +batch_progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + local_rank: ${settings.cuda_env.local_rank} + global_num_seen_steps: ${settings.training.global_num_seen_steps} + train_dataloader: + instance_key: train_dataloader + pass_type: BY_REFERENCE + eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + local_rank: ${settings.cuda_env.local_rank} + project: modalities_overfitting + mode: ONLINE + experiment_id: ${settings.experiment_id} + directory: "." 
+ config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/moe_configs/config_gpt2_small_redpajama_DE_1048576_moe.yaml b/config_files/training/moe_configs/config_gpt2_small_redpajama_DE_1048576_moe.yaml new file mode 100644 index 000000000..ece76f2e1 --- /dev/null +++ b/config_files/training/moe_configs/config_gpt2_small_redpajama_DE_1048576_moe.yaml @@ -0,0 +1,286 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + training: + global_training_log_interval_in_steps: 32 + global_checkpointing_interval_in_steps: 8192 + global_evaluation_interval_in_steps: 1024 + global_num_seen_steps: 0 + do_apply_activation_checkpointing: false + gradient_acc_steps: 1 + local_train_micro_batch_size: 16 + sequence_length: 2048 + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpointing_path: data/checkpoints + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpajama_v2_default_DE_num_docs_1050391/redpyjama_v2_sample_1050391.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: repeating_data_loader + config: + reshuffle_after_epoch: false + num_epochs: 100 + dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "train" + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +val_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpyjama_v2_default_DE_num_docs_1024/redpyjama_v2_default_DE_num_docs_1024.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +val_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "val" + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: val_dataset + pass_type: BY_REFERENCE + 
collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: fsdp + config: + checkpoint_path: ${settings.paths.checkpointing_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +wrapped_model: + component_key: model + variant_key: fsdp_wrapped + config: + model: + instance_key: model + pass_type: BY_REFERENCE + sync_module_states: true + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +model: + component_key: model + variant_key: gpt2 + config: + sample_key: ${settings.referencing_keys.sample_key} + poe_type: ABSOLUTE + block_size: ${settings.training.sequence_length} + prediction_key: ${settings.referencing_keys.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 12 + n_head_q: 12 + n_embd: 768 + dropout: 0.0 + bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: [] + # - type_hint: RotaryTransform + # config: + # n_embd: ${model.config.n_embd} + # n_head: ${model.config.n_head_q} #it has to be head_q here + # seq_length_dim: -2 + activation_type: gelu + weight_init: + mean: 0.0 + std: 0.02 + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: moe_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + attention_config: ${model.config.attention_config} + dropout: ${model.config.dropout} + ffn_hidden: 2048 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 12 + moe_num_experts: 4 + moe_top_k: 2 + moe_normalize_expert_weights: 0.1 + uniform_expert_assignment: false + moe_jitter_eps: 0.1 + moe_act_fn: + component_key: moe_act_fn + variant_key: silu + config: + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.000001 + betas: [0.9, 0.99] + eps: 1e-8 + weight_decay: 1e-2 + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + +scheduler: + component_key: scheduler + variant_key: dummy_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp + config: + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + + + +batch_progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + local_rank: ${settings.cuda_env.local_rank} + world_size: ${settings.cuda_env.world_size} + global_num_seen_steps: ${settings.training.global_num_seen_steps} + train_dataloader: + instance_key: train_dataloader + pass_type: BY_REFERENCE + eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + local_rank: ${settings.cuda_env.local_rank} + project: modalities + mode: ONLINE + experiment_id: ${settings.experiment_id} + directory: "." + config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/moe_configs/config_lorem_ipsum_moe.yaml b/config_files/training/moe_configs/config_lorem_ipsum_moe.yaml new file mode 100644 index 000000000..9784fc7bb --- /dev/null +++ b/config_files/training/moe_configs/config_lorem_ipsum_moe.yaml @@ -0,0 +1,306 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + training: + global_training_log_interval_in_steps: 8 + global_checkpointing_interval_in_steps: 3 + global_evaluation_interval_in_steps: 2 + global_num_seen_steps: 0 + do_apply_activation_checkpointing: true + gradient_acc_steps: 1 + local_train_micro_batch_size: 1 + sequence_length: 256 + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpointing_path: data/checkpoints + +tokenizer: + component_key: tokenizer + variant_key: pretrained_hf_tokenizer + config: + pretrained_model_name_or_path: ./data/tokenizer/hf_gpt2 + max_length: ${settings.training.sequence_length} + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ./data/lorem_ipsum.pbin + block_size: ${settings.training.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "train" + dataset: + instance_key: train_dataset + pass_type: 
BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.training.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +val_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "val" + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: 4 + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +test_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "test" + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: 2 + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: val_dataloader + pass_type: BY_REFERENCE + - instance_key: test_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: fsdp + config: + checkpoint_path: ${settings.paths.checkpointing_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +# resolving class types via different enums sucks... 
+loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: target_ids + prediction_key: logits + +wrapped_model: + component_key: model + variant_key: fsdp_wrapped + config: + model: + instance_key: model + pass_type: BY_REFERENCE + sync_module_states: true + mixed_precision_settings: BF_16 + sharding_strategy: FULL_SHARD + block_names: [MoEBlock] + +model: + component_key: model + variant_key: gpt2 + config: + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + block_size: ${settings.training.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_embd: 128 + dropout: 0.0 + n_head_q: 8 + weight_init: + mean: 0.0 + std: 0.02 + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: moe_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + activation_type: gelu + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + dropout: ${model.config.dropout} + ffn_hidden: 128 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 8 + moe_num_experts: 4 + moe_top_k: 2 + moe_normalize_expert_weights: 0.1 + uniform_expert_assignment: false + moe_jitter_eps: 0.1 + moe_act_fn: + component_key: moe_act_fn + variant_key: silu + config: + +scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: 32 + pct_start: 0.01 + anneal_strategy: cos + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp + config: + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + +batch_progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + local_rank: ${settings.cuda_env.local_rank} + global_num_seen_steps: ${settings.training.global_num_seen_steps} + train_dataloader: + instance_key: train_dataloader + pass_type: BY_REFERENCE + eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + local_rank: ${settings.cuda_env.local_rank} + project: modalities_lorem_ipsum_moe + experiment_id: ${settings.experiment_id} + mode: ONLINE + directory: "." 
+ config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/src/modalities/config/component_factory.py b/src/modalities/config/component_factory.py index 6a43a4379..54f9552b1 100644 --- a/src/modalities/config/component_factory.py +++ b/src/modalities/config/component_factory.py @@ -61,7 +61,7 @@ def _build_component( current_component_config = self._instantiate_component_config( component_key=component_key, variant_key=variant_key, - config_dict=materialized_component_config["config"], + config_dict=materialized_component_config.get("config") or dict(), ) # instantiate component component = self._instantiate_component( diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index a65ab0a6b..418e33a1a 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -13,6 +13,7 @@ from modalities.config.pydanctic_if_types import PydanticPytorchModuleType from modalities.config.utils import convert_base_model_config_to_dict from modalities.models.model import NNModel +from modalities.nn.moe import MoEFFN, MoEFFNConfig from modalities.util import parse_enum_by_name # GPT2 implementation taken from nanogpt https://github.com/karpathy/nanoGPT @@ -104,9 +105,11 @@ class QueryKeyValueTransformType(Enum): RotaryTransform = RotaryTransform +# FIXME Move or delete class ActivationType(str, Enum): GELU = "gelu" FUSED_SWIGLU = "fused_swiglu" + SILU = "silu" class AttentionConfig(BaseModel): @@ -129,43 +132,30 @@ def parse_sharding_strategy_by_name(cls, name): qkv_transforms: List[QueryKeyValueTransformConfig] -class WeightInitializationConfig(BaseModel): - mean: Annotated[float, Field(strict=True, ge=0.0)] - std: Annotated[float, Field(strict=True, ge=0.0)] - - -class GPT2LLMConfig(BaseModel): - sample_key: str - prediction_key: str - poe_type: PositionTypes - block_size: Annotated[int, Field(strict=True, ge=1)] - vocab_size: Annotated[ - int, Field(strict=True, ge=1) - ] # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency - n_layer: Annotated[int, Field(strict=True, ge=1)] - n_head_q: Annotated[int, Field(strict=True, ge=1)] - n_head_kv: Annotated[int, Field(strict=True, ge=1)] - n_embd: Annotated[int, Field(strict=True, ge=1)] - ffn_hidden: Annotated[int, Field(strict=True, ge=1)] - dropout: Annotated[float, Field(strict=True, ge=0.0)] - bias: bool # True: bias in Linears like GPT-2. 
False: a bit better and faster - attention_config: AttentionConfig +class GPT2BlockConfig(BaseModel): + n_embd: int + bias: bool + n_head_q: int + n_head_kv: int activation_type: ActivationType + attention_config: AttentionConfig + dropout: float + block_size: int + ffn_hidden: int attention_norm: PydanticPytorchModuleType ffn_norm: PydanticPytorchModuleType - lm_head_norm: PydanticPytorchModuleType - weight_init: WeightInitializationConfig @model_validator(mode="after") - def check_divisibility(self) -> "GPT2LLMConfig": + def check_divisibility(self) -> "GPT2BlockConfig": if self.n_head_q % self.n_head_kv != 0: raise ValueError("n_head_q must be divisible by n_head_kv") return self @model_validator(mode="after") - def validate_sizes(self) -> "GPT2LLMConfig": + def validate_sizes(self) -> "GPT2BlockConfig": for param, param_name in zip( - [self.ffn_hidden, self.vocab_size, self.n_embd], ["ffn_hidden", "vocab_size", "n_embd"] + [self.ffn_hidden, self.n_embd], + ["ffn_hidden", "n_embd"], ): if param % 128 != 0: # See https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc @@ -173,6 +163,20 @@ def validate_sizes(self) -> "GPT2LLMConfig": return self +class MoEBlockConfig(GPT2BlockConfig): + moe_num_experts: int + moe_top_k: int + moe_normalize_expert_weights: float + uniform_expert_assignment: bool + moe_act_fn: PydanticPytorchModuleType + moe_jitter_eps: float + + +class WeightInitializationConfig(BaseModel): + mean: Annotated[float, Field(strict=True, ge=0.0)] + std: Annotated[float, Field(strict=True, ge=0.0)] + + class CausalSelfAttention(nn.Module): def __init__( self, @@ -237,7 +241,11 @@ def projection(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch @staticmethod def execute_qkv_transforms( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, qkv_transforms: nn.ModuleList, n_head_q: int + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + qkv_transforms: nn.ModuleList, + n_head_q: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: batch_size, block_size, embedding_dim = q.size() n_head_dim = embedding_dim // n_head_q @@ -257,10 +265,22 @@ def execute_flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, d q = q.transpose(1, 2).contiguous() # (B, T, nh_q, hd) k = k.transpose(1, 2).contiguous() # (B, T, nh_kv, hd) v = v.transpose(1, 2).contiguous() # (B, T, nh_kv, hd) - return flash_attn_func(q, k, v, dropout_p=dropout, causal=True, softmax_scale=None, window_size=(-1, -1)) + return flash_attn_func( + q, + k, + v, + dropout_p=dropout, + causal=True, + softmax_scale=None, + window_size=(-1, -1), + ) def forward(self, x: torch.Tensor) -> torch.Tensor: - B, T, _ = x.size() # batch size (B), sequence length (T), embedding dimensionality (self.n_embd) + ( + B, + T, + _, + ) = x.size() # batch size (B), sequence length (T), embedding dimensionality (self.n_embd) q, k, v = self.projection(x) # q: (B, T, n_embd), k: (B, T, n_embd / n_rep), v: (B, T, n_embd / n_rep) # q: (B, nh_q, T, hd), k: (B, nh_kv, T, hd), v: (B, nh_kv, T, hd) @@ -312,22 +332,16 @@ def __init__( super().__init__() self.attention_norm = attention_norm self.ffn_norm = ffn_norm + self.attention_config = attention_config self.attn = CausalSelfAttention( n_head_q=n_head_q, n_head_kv=n_head_kv, n_embd=n_embd, - attention_config=attention_config, + attention_config=self.attention_config, bias=bias, dropout=dropout, block_size=block_size, ) - if activation_type == ActivationType.GELU: - self.mlp = TransformerMLP(n_embd=n_embd, 
ffn_hidden=ffn_hidden, bias=bias, dropout=dropout) - elif activation_type == ActivationType.FUSED_SWIGLU: - hidden_dim = 256 * ((int(2 * 4 * n_embd / 3) + 256 - 1) // 256) - self.mlp = xops.SwiGLU(n_embd, hidden_dim, n_embd, bias=False) - else: - raise NotImplementedError("unimplemented activation") def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.attention_norm(x) @@ -337,6 +351,117 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x +class TransformerBlock(GPT2Block): + def __init__( + self, + n_embd: int, + bias: bool, + n_head_q: int, + n_head_kv: int, + activation_type: ActivationType, + attention_config: AttentionConfig, + dropout: float, + block_size: int, + ffn_hidden: int, + attention_norm: nn.Module, + ffn_norm: nn.Module, + ): + super().__init__( + n_embd, + bias, + n_head_q, + n_head_kv, + activation_type, + attention_config, + dropout, + block_size, + ffn_hidden, + attention_norm, + ffn_norm, + ) + self.mlp = TransformerMLP(n_embd=n_embd, ffn_hidden=ffn_hidden, bias=bias, dropout=dropout) + + +class SwiGLUBlock(GPT2Block): + def __init__( + self, + n_embd: int, + bias: bool, + n_head_q: int, + n_head_kv: int, + activation_type: ActivationType, + attention_config: AttentionConfig, + dropout: float, + block_size: int, + ffn_hidden: int, + attention_norm: nn.Module, + ffn_norm: nn.Module, + ): + super().__init__( + n_embd, + bias, + n_head_q, + n_head_kv, + activation_type, + attention_config, + dropout, + block_size, + ffn_hidden, + attention_norm, + ffn_norm, + ) + hidden_dim = 256 * ((int(2 * 4 * n_embd / 3) + 256 - 1) // 256) + self.mlp = xops.SwiGLU(n_embd, hidden_dim, n_embd, bias=False) + + +class MoEBlock(GPT2Block): + def __init__( + self, + n_embd: int, + bias: bool, + n_head_q: int, + n_head_kv: int, + activation_type: ActivationType, + attention_config: AttentionConfig, + dropout: float, + block_size: int, + ffn_hidden: int, + attention_norm: nn.Module, + ffn_norm: nn.Module, + moe_num_experts: int, + moe_top_k: int, + moe_normalize_expert_weights: float, + uniform_expert_assignment: bool, + moe_act_fn: nn.Module, + moe_jitter_eps: float, + ): + super().__init__( + n_embd, + bias, + n_head_q, + n_head_kv, + activation_type, + attention_config, + dropout, + block_size, + ffn_hidden, + attention_norm, + ffn_norm, + ) + + moe_config = MoEFFNConfig( + moe_num_experts=moe_num_experts, + moe_top_k=moe_top_k, + moe_normalize_expert_weights=moe_normalize_expert_weights, + uniform_expert_assignment=uniform_expert_assignment, + ffn_hidden_size=ffn_hidden, + act_fn=lambda: deepcopy(moe_act_fn), + moe_jitter_eps=moe_jitter_eps, + ) + + self.mlp = MoEFFN(hidden_router_size=n_embd, config=moe_config) # change the ffn_hidden parameter's name + + class GPT2LLM(NNModel): def __init__( self, @@ -346,18 +471,12 @@ def __init__( block_size: int, vocab_size: int, n_layer: int, - n_head_q: int, - n_head_kv: int, n_embd: int, - ffn_hidden: int, + n_head_q: int, dropout: float, - bias: bool, - activation_type: ActivationType, weight_init: WeightInitializationConfig, - attention_config: AttentionConfig, - attention_norm: nn.Module, - ffn_norm: nn.Module, lm_head_norm: nn.Module, + gpt2block: GPT2Block, seed: int = None, ): super().__init__(seed=seed) @@ -380,9 +499,11 @@ def __init__( else: raise TypeError(f"{poe_type} not supported") - if poe_type is not PositionTypes.NOPE and RotaryTransform in [ - config.type_hint.value for config in attention_config.qkv_transforms - ]: + if ( + gpt2block.attention_config + and poe_type is not PositionTypes.NOPE + and 
RotaryTransform in [config.type_hint.value for config in gpt2block.attention_config.qkv_transforms] + ): raise ValueError('It is expected to use "RotaryTransform" together with "NOPE".') self.transformer = nn.ModuleDict( @@ -390,24 +511,7 @@ def __init__( wte=nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd), wpe=wpe, drop=nn.Dropout(dropout), - h=nn.ModuleList( - [ - GPT2Block( - n_embd=n_embd, - bias=bias, - n_head_q=n_head_q, - n_head_kv=n_head_kv, - activation_type=activation_type, - attention_config=attention_config, - dropout=dropout, - block_size=block_size, - ffn_hidden=ffn_hidden, - attention_norm=deepcopy(attention_norm), - ffn_norm=deepcopy(ffn_norm), - ) - for _ in range(n_layer) - ] - ), + h=nn.ModuleList([deepcopy(gpt2block) for _ in range(n_layer)]), ln_f=lm_head_norm, ) ) @@ -423,7 +527,11 @@ def __init__( # apply special scaled init to the residual projections, per GPT-2 paper for pn, p in self.named_parameters(): if pn.endswith("c_proj.weight"): - torch.nn.init.normal_(p, mean=weight_init.mean, std=weight_init.std / math.sqrt(2 * n_layer)) + torch.nn.init.normal_( + p, + mean=weight_init.mean, + std=weight_init.std / math.sqrt(2 * n_layer), + ) def _init_weights(self, module: nn.Module, weight_init: WeightInitializationConfig): if isinstance(module, nn.Linear): @@ -458,3 +566,28 @@ def forward_impl(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tenso def forward(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: return self.forward_impl(inputs) + + +class GPT2LLMConfig(BaseModel): + sample_key: str + prediction_key: str + poe_type: PositionTypes + block_size: Annotated[int, Field(strict=True, ge=1)] + vocab_size: Annotated[ + int, Field(strict=True, ge=1) + ] # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: Annotated[int, Field(strict=True, ge=1)] + n_embd: Annotated[int, Field(strict=True, ge=1)] + n_head_q: Annotated[int, Field(strict=True, ge=1)] + dropout: Annotated[float, Field(strict=True, ge=0.0)] + lm_head_norm: PydanticPytorchModuleType + weight_init: WeightInitializationConfig + gpt2block: PydanticPytorchModuleType + + @model_validator(mode="after") + def validate_sizes(self) -> "GPT2LLMConfig": + for param, param_name in zip([self.vocab_size, self.n_embd], ["vocab_size", "n_embd"]): + if param % 128 != 0: + # See https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + raise ValueError(f"{param_name} with value {param} should be divisible by 128 for efficient training.") + return self diff --git a/src/modalities/nn/activations.py b/src/modalities/nn/activations.py new file mode 100644 index 000000000..5913e00f3 --- /dev/null +++ b/src/modalities/nn/activations.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +class ActivationConfig(BaseModel): + pass + +class LeakyReLUConfig(ActivationConfig): + scale: float + negative_slope: float = 0.01 + zero_point: int \ No newline at end of file diff --git a/src/modalities/nn/moe.py b/src/modalities/nn/moe.py new file mode 100644 index 000000000..0501181cc --- /dev/null +++ b/src/modalities/nn/moe.py @@ -0,0 +1,146 @@ +from typing import Callable, Optional, Tuple + +import torch +import torch.nn as nn +from pydantic import BaseModel + + +# MoE implementation inspired from Dbrx https://github.com/databricks/dbrx/blob/main/model/modeling_dbrx.py +class MoEFFNConfig(BaseModel): + moe_num_experts: int + moe_top_k: int + moe_normalize_expert_weights: float + uniform_expert_assignment: 
bool + ffn_hidden_size: int + act_fn: Callable[[], nn.Module] = nn.SiLU + moe_jitter_eps: Optional[float] + + +class MoERouter(nn.Module): + def __init__(self, hidden_size: int, moe_config: MoEFFNConfig): + super().__init__() + self.hidden_size = hidden_size + self.moe_num_experts = moe_config.moe_num_experts + self.moe_top_k = moe_config.moe_top_k + self.moe_normalize_expert_weights = moe_config.moe_normalize_expert_weights + self.uniform_expert_assignment = moe_config.uniform_expert_assignment + self.moe_jitter_eps = moe_config.moe_jitter_eps + + self.layer = nn.Linear(self.hidden_size, self.moe_num_experts, bias=False) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.LongTensor]: + if self.training and self.moe_jitter_eps is not None: + x = x * self._jitter(x) + + weights = self.layer(x.view(-1, x.shape[-1])).softmax(dim=-1, dtype=torch.float32) + top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) + + if self.moe_normalize_expert_weights: + top_weights = top_weights / torch.norm( + top_weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True + ) + + if self.uniform_expert_assignment: + with torch.no_grad(): + uniform_tensor = ( + torch.arange(0, top_experts.numel(), device=top_experts.device, dtype=top_experts.dtype) + % self.moe_num_experts + ) + top_experts = uniform_tensor.reshape(top_experts.shape) + # Note, weights and top_weights are not changed + + top_weights = top_weights.to(x.dtype) + return top_weights, top_experts + + def _jitter(self, x: torch.Tensor) -> torch.Tensor: + if self.moe_jitter_eps is None: + raise RuntimeError("The router does not have moe_jitter_eps set.") + low = 1.0 - self.moe_jitter_eps + high = 1.0 + self.moe_jitter_eps + noise = torch.rand(x.size(), dtype=x.dtype, device=x.device) + return low + noise * (high - low) + + +class MoEExpertGLU(nn.Module): + def __init__( + self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, act_fn: Callable[[], nn.Module] = nn.GELU + ): + super().__init__() + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + self.activation_fn = act_fn() + + self.w1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.v1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.w2 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + + with torch.no_grad(): + torch.nn.init.normal_(self.w1) + torch.nn.init.normal_(self.v1) + torch.nn.init.normal_(self.w2) + + def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: + expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + + x1 = x.matmul(expert_w1.t()) + x2 = x.matmul(expert_v1.t()) + x1 = self.activation_fn(x1) + x1 = x1 * x2 + x1 = x1.matmul(expert_w2) + return x1 + + +class MoEExperts(nn.Module): + def __init__( + self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, act_fn: Callable[[], nn.Module] = nn.GELU + ): + super().__init__() + self.moe_num_experts = moe_num_experts + self.mlp = MoEExpertGLU( + hidden_size=hidden_size, ffn_hidden_size=ffn_hidden_size, moe_num_experts=moe_num_experts, act_fn=act_fn + ) + + def forward(self, x: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor) -> 
torch.Tensor: + bsz, q_len, hidden_size = x.shape + x = x.view(-1, hidden_size) + out = torch.zeros_like(x) + + expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + for expert_idx in range(0, self.moe_num_experts): + topk_idx, token_idx = torch.where(expert_mask[expert_idx]) + if token_idx.shape[0] == 0: + continue + + token_list = token_idx.tolist() + topk_list = topk_idx.tolist() + + expert_tokens = x[None, token_list].reshape(-1, hidden_size) + expert_out = self.mlp(expert_tokens, expert_idx) * top_weights[token_list, topk_list, None] + + out.index_add_(0, token_idx, expert_out) + + out = out.reshape(bsz, q_len, hidden_size) + return out + + +class MoEFFN(nn.Module): + def __init__(self, hidden_router_size: int, config: MoEFFNConfig): + super().__init__() + self.config = config + + self.router = MoERouter(hidden_router_size, config) + + self.experts = MoEExperts( + hidden_size=hidden_router_size, + ffn_hidden_size=self.config.ffn_hidden_size, + moe_num_experts=self.config.moe_num_experts, + act_fn=self.config.act_fn, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + top_weights, top_experts = self.router(x) + out = self.experts(x, top_weights, top_experts) + return out diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 62fafb503..f546165ce 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -61,7 +61,15 @@ from modalities.models.coca.collator import CoCaCollateFnConfig, CoCaCollatorFn from modalities.models.components.layer_norms import LayerNormConfig, RMSLayerNorm, RMSLayerNormConfig from modalities.models.gpt2.collator import GPT2LLMCollateFn -from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2LLMConfig +from modalities.models.gpt2.gpt2_model import ( + GPT2LLM, + GPT2BlockConfig, + GPT2LLMConfig, + MoEBlock, + MoEBlockConfig, + SwiGLUBlock, + TransformerBlock, +) from modalities.models.huggingface.huggingface_models import ( HuggingFacePretrainedModel, HuggingFacePretrainedModelConfig, @@ -69,6 +77,7 @@ from modalities.models.mamba.mamba_config import MambaLLMConfig from modalities.models.mamba.mamba_model import MambaLLM from modalities.models.model_factory import ModelFactory +from modalities.nn.activations import ActivationConfig, LeakyReLUConfig from modalities.optimizers.lr_schedulers import DummyLRScheduler from modalities.optimizers.optimizer_factory import OptimizerFactory from modalities.tokenization.tokenizer_wrapper import PreTrainedHFTokenizer, PreTrainedSPTokenizer @@ -97,10 +106,23 @@ class ComponentEntity: ComponentEntity("model", "gpt2", GPT2LLM, GPT2LLMConfig), ComponentEntity("model", "mamba", MambaLLM, MambaLLMConfig), ComponentEntity( - "model", "huggingface_pretrained_model", HuggingFacePretrainedModel, HuggingFacePretrainedModelConfig + "model", + "huggingface_pretrained_model", + HuggingFacePretrainedModel, + HuggingFacePretrainedModelConfig, + ), + ComponentEntity( + "model", + "checkpointed", + ModelFactory.get_checkpointed_model, + CheckpointedModelConfig, + ), + ComponentEntity( + "model", + "fsdp_wrapped", + ModelFactory.get_fsdp_wrapped_model, + FSDPWrappedModelConfig, ), - ComponentEntity("model", "checkpointed", ModelFactory.get_checkpointed_model, CheckpointedModelConfig), - ComponentEntity("model", "fsdp_wrapped", ModelFactory.get_fsdp_wrapped_model, FSDPWrappedModelConfig), ComponentEntity("model", "coca", CoCa, CoCaConfig), # losses ComponentEntity("loss", "clm_cross_entropy_loss", 
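# A quick illustration (a sketch, not an exact excerpt) of the routing math that MoERouter in
# src/modalities/nn/moe.py above implements: per-token softmax over expert scores, top-k
# selection, and optional p-norm re-normalization of the kept weights. Sizes are invented.
import torch

num_tokens, num_experts, top_k = 6, 4, 2          # made-up sizes for the sketch
scores = torch.randn(num_tokens, num_experts)     # what the router's Linear layer would emit
weights = scores.softmax(dim=-1, dtype=torch.float32)
top_weights, top_experts = torch.topk(weights, top_k, dim=-1)
# corresponds to moe_normalize_expert_weights=0.1 in the configs of this diff
top_weights = top_weights / torch.norm(top_weights, p=0.1, dim=-1, keepdim=True)
# each token is then processed by the experts listed in top_experts and the expert outputs
# are combined as a weighted sum using top_weights (see MoEExperts.forward above).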
CLMCrossEntropyLoss, CLMCrossEntropyLossConfig), @@ -108,22 +130,53 @@ class ComponentEntity: ComponentEntity("optimizer", "adam", OptimizerFactory.get_adam, AdamOptimizerConfig), ComponentEntity("optimizer", "adam_w", OptimizerFactory.get_adam_w, AdamWOptimizerConfig), ComponentEntity( - "optimizer", "checkpointed", OptimizerFactory.get_checkpointed_optimizer, CheckpointedOptimizerConfig + "optimizer", + "checkpointed", + OptimizerFactory.get_checkpointed_optimizer, + CheckpointedOptimizerConfig, ), # schedulers ComponentEntity("scheduler", "dummy_lr", DummyLRScheduler, DummyLRSchedulerConfig), ComponentEntity("scheduler", "step_lr", torch.optim.lr_scheduler.StepLR, StepLRSchedulerConfig), - ComponentEntity("scheduler", "constant_lr", torch.optim.lr_scheduler.ConstantLR, ConstantLRSchedulerConfig), - ComponentEntity("scheduler", "onecycle_lr", torch.optim.lr_scheduler.OneCycleLR, OneCycleLRSchedulerConfig), ComponentEntity( - "scheduler", "cosine_annealing_lr", torch.optim.lr_scheduler.CosineAnnealingLR, CosineAnnealingLRSchedulerConfig + "scheduler", + "constant_lr", + torch.optim.lr_scheduler.ConstantLR, + ConstantLRSchedulerConfig, + ), + ComponentEntity( + "scheduler", + "onecycle_lr", + torch.optim.lr_scheduler.OneCycleLR, + OneCycleLRSchedulerConfig, + ), + ComponentEntity( + "scheduler", + "cosine_annealing_lr", + torch.optim.lr_scheduler.CosineAnnealingLR, + CosineAnnealingLRSchedulerConfig, ), # tokenizers - ComponentEntity("tokenizer", "pretrained_hf_tokenizer", PreTrainedHFTokenizer, PreTrainedHFTokenizerConfig), - ComponentEntity("tokenizer", "pretrained_sp_tokenizer", PreTrainedSPTokenizer, PreTrainedSPTokenizerConfig), + ComponentEntity( + "tokenizer", + "pretrained_hf_tokenizer", + PreTrainedHFTokenizer, + PreTrainedHFTokenizerConfig, + ), + ComponentEntity( + "tokenizer", + "pretrained_sp_tokenizer", + PreTrainedSPTokenizer, + PreTrainedSPTokenizerConfig, + ), # ComponentEntity("tokenizer", "llama_tokenizer_fast", GPT2TokenizerFast, None), # TODO # datasets - ComponentEntity("dataset", "mem_map_dataset", DatasetFactory.get_mem_map_dataset, MemMapDatasetConfig), + ComponentEntity( + "dataset", + "mem_map_dataset", + DatasetFactory.get_mem_map_dataset, + MemMapDatasetConfig, + ), ComponentEntity( "dataset", "packed_mem_map_dataset_continuous", @@ -137,7 +190,10 @@ class ComponentEntity: PackedMemMapDatasetMegatronConfig, ), ComponentEntity( - "dataset", "open_gptx_mmap_dataset", DatasetFactory.get_open_gptx_mmap_dataset, OpenGPTXMMapDatasetConfig + "dataset", + "open_gptx_mmap_dataset", + DatasetFactory.get_open_gptx_mmap_dataset, + OpenGPTXMMapDatasetConfig, ), ComponentEntity("dataset", "dummy_dataset", DatasetFactory.get_dummy_dataset, DummyDatasetConfig), # samplers @@ -150,7 +206,10 @@ class ComponentEntity: # data loaders ComponentEntity("data_loader", "default", DataloaderFactory.get_dataloader, LLMDataLoaderConfig), ComponentEntity( - "data_loader", "repeating_data_loader", DataloaderFactory.get_repeating_dataloader, RepeatingDataLoaderConfig + "data_loader", + "repeating_data_loader", + DataloaderFactory.get_repeating_dataloader, + RepeatingDataLoaderConfig, ), # checkpointing ComponentEntity("checkpoint_saving", "default", CheckpointSaving, CheckpointSavingConfig), @@ -168,10 +227,20 @@ class ComponentEntity: SaveKMostRecentCheckpointsStrategyConfig, ), # checkpoint saving execution - ComponentEntity("checkpoint_saving_execution", "fsdp", FSDPCheckpointSaving, FSDPCheckpointSavingConfig), + ComponentEntity( + "checkpoint_saving_execution", + "fsdp", + 
FSDPCheckpointSaving, + FSDPCheckpointSavingConfig, + ), # checkpoint loading ComponentEntity("checkpoint_loading", "fsdp", FSDPCheckpointLoading, FSDPCheckpointLoadingConfig), - ComponentEntity("checkpoint_loading", "torch", TorchCheckpointLoading, TorchCheckpointLoadingConfig), + ComponentEntity( + "checkpoint_loading", + "torch", + TorchCheckpointLoading, + TorchCheckpointLoadingConfig, + ), # Progress subscriber ComponentEntity( "progress_subscriber", @@ -187,10 +256,16 @@ class ComponentEntity: ), # Results subscriber ComponentEntity( - "results_subscriber", "dummy", ResultsSubscriberFactory.get_dummy_result_subscriber, DummyResultSubscriberConfig + "results_subscriber", + "dummy", + ResultsSubscriberFactory.get_dummy_result_subscriber, + DummyResultSubscriberConfig, ), ComponentEntity( - "results_subscriber", "rich", ResultsSubscriberFactory.get_rich_result_subscriber, RichResultSubscriberConfig + "results_subscriber", + "rich", + ResultsSubscriberFactory.get_rich_result_subscriber, + RichResultSubscriberConfig, ), ComponentEntity( "results_subscriber", @@ -201,10 +276,21 @@ class ComponentEntity: # layer norms ComponentEntity("layer_norm", "rms_norm", RMSLayerNorm, RMSLayerNormConfig), ComponentEntity("layer_norm", "layer_norm", nn.LayerNorm, LayerNormConfig), + # block configs + ComponentEntity("block", "moe_block", MoEBlock, MoEBlockConfig), + ComponentEntity("block", "transformer_block", TransformerBlock, GPT2BlockConfig), + ComponentEntity("block", "swiglu_block", SwiGLUBlock, GPT2BlockConfig), + # moe activation_fn configs + ComponentEntity("moe_act_fn", "silu", nn.SiLU, ActivationConfig), + ComponentEntity("moe_act_fn", "relu", nn.ReLU, ActivationConfig), + ComponentEntity("moe_act_fn", "leaky_relu", nn.LeakyReLU, LeakyReLUConfig), # gradient clippers ComponentEntity("gradient_clipper", "fsdp", FSDPGradientClipper, FSDPGradientClipperConfig), ComponentEntity( - "gradient_clipper", "fsdp_logging_only", FSDPLoggingOnlyGradientClipper, FSDPDummyGradientClipperConfig + "gradient_clipper", + "fsdp_logging_only", + FSDPLoggingOnlyGradientClipper, + FSDPDummyGradientClipperConfig, ), ComponentEntity("gradient_clipper", "dummy", DummyGradientClipper, DummyGradientClipperConfig), ] diff --git a/tests/nn/test_moe.py b/tests/nn/test_moe.py new file mode 100644 index 000000000..bce963723 --- /dev/null +++ b/tests/nn/test_moe.py @@ -0,0 +1,121 @@ +import pytest +import torch +import torch.nn as nn + +from modalities.nn.moe import MoEExpertGLU, MoEExperts, MoEFFN, MoEFFNConfig, MoERouter + + +def test_moe_router_produces_expected_shapes( + model_input: torch.Tensor, + batch_size: int, + seq_length: int, + moe_top_k: int, + moe_router: MoERouter, +): + top_weights, top_experts = moe_router.forward(model_input) + assert top_weights.shape == (batch_size * seq_length, moe_top_k) + assert top_experts.shape == (batch_size * seq_length, moe_top_k) + + +def test_moe_router_jitter_does_not_change_shape(model_input: torch.Tensor, moe_router: MoERouter): + jittered_model_input = moe_router._jitter(model_input) + assert jittered_model_input.shape == model_input.shape + + +def test_moe_expert_produces_expected_shape( + model_input: torch.Tensor, batch_size: int, seq_length: int, hidden_size: int, moe_num_experts: int +): + ffn_hidden_size = 128 + act_fn = nn.ReLU + model = MoEExpertGLU(hidden_size, ffn_hidden_size, moe_num_experts, act_fn) + expert_idx = 2 + output = model.forward(model_input, expert_idx) + assert output.shape == (batch_size, seq_length, hidden_size) + + +def 
test_moe_expert_errors_with_invalid_idx(model_input: torch.Tensor, hidden_size: int, moe_num_experts: int): + ffn_hidden_size = 128 + act_fn = nn.ReLU + + model = MoEExpertGLU(hidden_size, ffn_hidden_size, moe_num_experts, act_fn) + invalid_expert_idx = moe_num_experts + 2 + + with pytest.raises(IndexError): + model.forward(model_input, invalid_expert_idx) + + +def test_moe_experts_produce_expected_shape( + batch_size: int, seq_length: int, hidden_size: int, moe_num_experts: int, moe_top_k: int +): + ffn_hidden_size = 64 + act_fn = nn.ReLU + + model = MoEExperts(hidden_size, ffn_hidden_size, moe_num_experts, act_fn) + x = torch.rand(batch_size, seq_length, hidden_size) + top_weights = torch.rand(batch_size * seq_length, moe_top_k) + top_experts = torch.randint(0, moe_top_k, (batch_size * seq_length, moe_top_k)) + + output = model(x, top_weights, top_experts) + + assert output.shape == (batch_size, seq_length, hidden_size) + + +def test_moeffn_output_shape(batch_size: int, seq_length: int, moe_config: MoEFFNConfig): + hidden_router_size = 128 + model = MoEFFN(hidden_router_size, moe_config) + input_tensor = torch.randn(batch_size, seq_length, hidden_router_size) + output = model(input_tensor) + assert output.shape == (batch_size, seq_length, hidden_router_size) + + +@pytest.fixture +def moe_router(hidden_size: int, moe_config: MoEFFNConfig) -> MoERouter: + return MoERouter(hidden_size, moe_config) + + +@pytest.fixture +def moe_config(moe_num_experts: int, moe_top_k: int, uniform_expert_assignment: bool) -> MoEFFNConfig: + return MoEFFNConfig( + moe_num_experts=moe_num_experts, + moe_top_k=moe_top_k, + moe_normalize_expert_weights=2.0, + uniform_expert_assignment=uniform_expert_assignment, + ffn_hidden_size=128, + act_fn=nn.ReLU, + moe_jitter_eps=0.1, + ) + + +@pytest.fixture +def model_input(batch_size: int, seq_length: int, hidden_size: int) -> torch.Tensor: + return torch.randn(batch_size, seq_length, hidden_size) + + +@pytest.fixture +def batch_size() -> int: + return 4 + + +@pytest.fixture +def seq_length() -> int: + return 32 + + +@pytest.fixture +def hidden_size() -> int: + return 10 + + +@pytest.fixture +def moe_num_experts() -> int: + return 5 + + +@pytest.fixture +def moe_top_k() -> int: + return 3 + + +@pytest.fixture(params=[True, False]) +def uniform_expert_assignment(request: pytest.FixtureRequest) -> bool: + return request.param diff --git a/tests/test_yaml_configs/config_lorem_ipsum.yaml b/tests/test_yaml_configs/config_lorem_ipsum.yaml index 8f832e998..732989973 100644 --- a/tests/test_yaml_configs/config_lorem_ipsum.yaml +++ b/tests/test_yaml_configs/config_lorem_ipsum.yaml @@ -169,7 +169,7 @@ wrapped_model: sync_module_states: true mixed_precision_settings: BF_16 sharding_strategy: FULL_SHARD - block_names: [GPT2Block] + block_names: [TransformerBlock] model: component_key: model @@ -182,36 +182,11 @@ model: vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency n_layer: 2 n_head_q: 8 - n_head_kv: 8 - ffn_hidden: 128 n_embd: 128 - dropout: 0.0 - bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster - attention_config: - qkv_transforms: - - type_hint: RotaryTransform - config: - n_embd: ${model.config.n_embd} - n_head: ${model.config.n_head_q} #it has to be head_q here - seq_length_dim: -2 - activation_type: gelu + dropout: 0.0 weight_init: mean: 0.0 std: 0.02 - attention_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 - ffn_norm: - component_key: layer_norm - variant_key: rms_norm - config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 lm_head_norm: component_key: layer_norm variant_key: rms_norm @@ -219,6 +194,39 @@ model: ndim: ${model.config.n_embd} bias: true epsilon: 1e-5 + gpt2block: + component_key: block + variant_key: transformer_block + config: + block_size: ${settings.training.sequence_length} + n_embd: ${model.config.n_embd} + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + activation_type: gelu + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + dropout: ${model.config.dropout} + ffn_hidden: 128 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + n_head_q: ${model.config.n_head_q} + n_head_kv: 8 scheduler: component_key: scheduler
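# To make the expert-dispatch bookkeeping in MoEExperts.forward (src/modalities/nn/moe.py,
# earlier in this diff) easier to follow, here is a self-contained sketch of the same
# one_hot / torch.where / index_add_ pattern. A plain nn.Linear stands in for the gated
# expert MLP, and all sizes are invented for illustration.
import torch
import torch.nn as nn

num_tokens, hidden, num_experts, top_k = 8, 16, 4, 2
x = torch.randn(num_tokens, hidden)
top_weights = torch.rand(num_tokens, top_k)                       # per-token expert weights
top_experts = torch.randint(0, num_experts, (num_tokens, top_k))  # per-token expert indices
experts = nn.ModuleList([nn.Linear(hidden, hidden) for _ in range(num_experts)])

out = torch.zeros_like(x)
# (num_experts, top_k, num_tokens): which slot of which token selected which expert
expert_mask = nn.functional.one_hot(top_experts, num_classes=num_experts).permute(2, 1, 0)
for expert_idx in range(num_experts):
    topk_idx, token_idx = torch.where(expert_mask[expert_idx])
    if token_idx.numel() == 0:
        continue                                                  # nothing routed to this expert
    expert_out = experts[expert_idx](x[token_idx]) * top_weights[token_idx, topk_idx, None]
    out.index_add_(0, token_idx, expert_out)                      # weighted sum per token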