diff --git a/examples/deepseek/README.md b/examples/deepseek/README.md
new file mode 100644
index 0000000..312b92a
--- /dev/null
+++ b/examples/deepseek/README.md
@@ -0,0 +1,9 @@
+# Example Analyses with DeepSeek-V3
+
+[DeepSeek-V3 Technical Report](https://arxiv.org/abs/2412.19437)
+
+Run the pretraining analysis:
+
+```sh
+./run_pretrain.sh
+```
diff --git a/examples/deepseek/run_pretrain.sh b/examples/deepseek/run_pretrain.sh
new file mode 100755
index 0000000..e4e2683
--- /dev/null
+++ b/examples/deepseek/run_pretrain.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# model config: https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
+
+model_name=deepseek-ai/DeepSeek-V3 # Hugging Face model name or path to a model config JSON file
+seq_len=4096 # sequence length
+gpu_name='h800-sxm-80gb' # list options with `python -m llm_analysis.config list_gpu_configs`
+dtype_name="w16a16e16l8" # 16-bit weights, 16-bit activations for non-linear ops and embeddings, 8-bit linear ops
+flops_efficiency=0.25 # model FLOPs utilization (MFU)
+batch_size_per_gpu=1 # device_train_microbatch_size
+num_gpus=2048 # total number of GPUs
+activation_recomputation=2 # 0: no activation recomputation; 1: checkpoint attention compute; 2: checkpoint attention; 3: checkpoint layernorm-attention-layernorm; 4: checkpoint the entire transformer layer
+dp_size=128 # data parallelization size for sharding
+ep_size=64 # expert parallelization size, moe_dp_sharding_size = dp_size / ep_size
+tp_size=1 # tensor parallelization size
+pp_size=16 # pipeline parallelization size
+ds_zero=1 # DP sharding strategy (ZeRO stage), see https://github.com/cli99/llm-analysis#parallelism-scheme
+mlp_activation_quant_bits=8 # number of bits used for MLP activations
+mlp_recompute_act=True # whether to recompute the GELU activation in the MLP backward pass
+hbm_memory_efficiency=0.8 # GPU HBM memory efficiency
+intra_node_memory_efficiency=0.8 # intra-node communication efficiency
+inter_node_memory_efficiency=0.8 # inter-node communication efficiency
+total_num_tokens=14.8e12 # number of tokens to train on
+master_weights_dtype_bytes=4 # FP32 master weights
+other_op_bytes=4 # 4 bytes total for the BF16 first and second AdamW moments
+output_dir=output_deepseek # directory for the analysis output
+output_file_prefix="bs${batch_size_per_gpu}-ar${activation_recomputation}-zero${ds_zero}-"
+layernorm_dtype_bytes=2 # bytes per element for layernorm activations (BF16)
+
+python -m llm_analysis.analysis train \
+    --model_name=${model_name} \
+    --seq_len=${seq_len} \
+    --gpu_name=${gpu_name} \
+    --dtype_name=${dtype_name} \
+    --output_dir=${output_dir} \
+    --output_file_prefix=${output_file_prefix} \
+    --activation_recomputation=${activation_recomputation} \
+    --ds_zero=${ds_zero} \
+    --batch_size_per_gpu=${batch_size_per_gpu} \
+    --total_num_gpus=${num_gpus} \
+    --dp_size=${dp_size} \
+    --tp_size=${tp_size} \
+    --pp_size=${pp_size} \
+    --ep_size=${ep_size} \
+    --flops_efficiency=${flops_efficiency} \
+    --hbm_memory_efficiency=${hbm_memory_efficiency} \
+    --total_num_tokens=${total_num_tokens} \
+    --mlp_activation_quant_bits=${mlp_activation_quant_bits} \
+    --layernorm_dtype_bytes=${layernorm_dtype_bytes} \
+    --mlp_recompute_act=${mlp_recompute_act} \
+    --master_weights_dtype_bytes=${master_weights_dtype_bytes} \
+    --other_op_bytes=${other_op_bytes} \
+    --intra_node_memory_efficiency=${intra_node_memory_efficiency} \
+    --inter_node_memory_efficiency=${inter_node_memory_efficiency}
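+
+# A minimal sanity check (a sketch, not part of llm-analysis): the comments
+# above imply num_gpus = dp_size * tp_size * pp_size (128 * 1 * 16 = 2048) and
+# moe_dp_sharding_size = dp_size / ep_size (128 / 64 = 2); warn if the settings
+# drift out of sync when this script is edited.
+if [ $((dp_size * tp_size * pp_size)) -ne "${num_gpus}" ]; then
+    echo "warning: dp_size * tp_size * pp_size = $((dp_size * tp_size * pp_size)) != num_gpus = ${num_gpus}" >&2
+fi
+if [ $((dp_size % ep_size)) -ne 0 ]; then
+    echo "warning: ep_size = ${ep_size} does not divide dp_size = ${dp_size}" >&2
+fi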