From 4cf3f4b19b7a433682526f3781f87cd96e3f5cca Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Thu, 3 Nov 2022 10:34:22 -0700 Subject: [PATCH 01/11] Modifying loss checking to support bf16. (#92) --- megatron/training.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 8780582ef..f02df5ba2 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -963,7 +963,10 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Logging. if args.deepspeed: - loss_scale = model[0].optimizer.cur_scale + if hasattr(model[0].optimizer, 'cur_scale'): + loss_scale = model[0].optimizer.cur_scale + else: + loss_scale = None else: loss_scale = optimizer.get_loss_scale().item() params_norm = None @@ -1233,4 +1236,4 @@ def build_train_valid_test_data_iterators( else: test_data_iterator = None - return train_data_iterator, valid_data_iterator, test_data_iterator \ No newline at end of file + return train_data_iterator, valid_data_iterator, test_data_iterator From 238ab6df7e7561884fa2f3993d69099758105946 Mon Sep 17 00:00:00 2001 From: savitamittal1 <39776179+savitamittal1@users.noreply.github.com> Date: Mon, 7 Nov 2022 10:14:39 -0800 Subject: [PATCH 02/11] Updated to Curated acpt env and removed deepspeed install from github (#88) * updated curated env * updated to use aml curated env --- examples/azureml/Dockerfile.dockerfile | 13 ++----------- examples/azureml/aml_submit.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/examples/azureml/Dockerfile.dockerfile b/examples/azureml/Dockerfile.dockerfile index 2d43612f5..6e70a2d56 100644 --- a/examples/azureml/Dockerfile.dockerfile +++ b/examples/azureml/Dockerfile.dockerfile @@ -1,14 +1,5 @@ -FROM mcr.microsoft.com/azureml/aifx/stable-ubuntu2004-cu115-py38-torch1110 - +FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.5-gpu USER root:root RUN pip install pybind11 - -RUN pip install git+https://github.com/microsoft/DeepSpeed.git - -# add a100-topo.xml -RUN mkdir -p /opt/microsoft/ -RUN wget -O /opt/microsoft/a100-topo.xml https://hpcbenchmarks.blob.core.windows.net/bookcorpus/data/a100-topo.xml - -# to use on A100, enable env var below in your job -ENV NCCL_TOPO_FILE="/opt/microsoft/a100-topo.xml" +RUN pip install regex \ No newline at end of file diff --git a/examples/azureml/aml_submit.py b/examples/azureml/aml_submit.py index 58ea6f683..ebfa0a9bf 100644 --- a/examples/azureml/aml_submit.py +++ b/examples/azureml/aml_submit.py @@ -8,6 +8,7 @@ from azureml.core.compute import ComputeTarget, AmlCompute from azureml.core.compute_target import ComputeTargetException from azureml.core.runconfig import PyTorchConfiguration +from azureml.core.environment import DockerBuildContext # Check core SDK version number print("SDK version:", azureml.core.VERSION) @@ -64,15 +65,9 @@ #------------------------------------------------------------------------------- # Setup training environment #------------------------------------------------------------------------------- -megatron_ds_env = Environment.from_dockerfile(name='megatron-ds-ptca', dockerfile='Dockerfile.dockerfile') -megatron_ds_env.register(ws).build(ws).wait_for_completion() # Comment this out if environment already exists -megatron_ds_env.environment_variables['NCCL_DEBUG'] = 'WARN' -megatron_ds_env.environment_variables['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' -megatron_ds_env.environment_variables['NCCL_SOCKET_IFNAME'] = 'eth0' 
-megatron_ds_env.environment_variables['NCCL_IB_PCI_RELAXED_ORDERING']='1' -megatron_ds_env.environment_variables['UCX_TLS']='tcp' -megatron_ds_env.environment_variables['UCX_NET_DEVICES']='eth0' +megatron_ds_env = Environment.from_docker_build_context(name='megatron-ds-curated-acpt', docker_build_context=DockerBuildContext.from_local_directory(workspace = ws, path = '.', dockerfile_path='Dockerfile.dockerfile')) +megatron_ds_env.register(ws).build(ws).wait_for_completion() # Comment this out if environment already exists #------------------------------------------------------------------------------- # Training Settings and Arguments @@ -187,6 +182,13 @@ environment=megatron_ds_env, distributed_job_config=distr_config) +megatron_ds_src.run_config.environment_variables['NCCL_DEBUG'] = 'WARN' +megatron_ds_src.run_config.environment_variables['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' +megatron_ds_src.run_config.environment_variables['NCCL_SOCKET_IFNAME'] = 'eth0' +megatron_ds_src.run_config.environment_variables['NCCL_IB_PCI_RELAXED_ORDERING']='1' +megatron_ds_src.run_config.environment_variables['UCX_TLS']='tcp' +megatron_ds_src.run_config.environment_variables['UCX_NET_DEVICES']='eth0' + #------------------------------------------------------------------------------- # Submit experiment #------------------------------------------------------------------------------- From c685fb5e4973864ab0d0ad30e55edc014e151ca5 Mon Sep 17 00:00:00 2001 From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com> Date: Fri, 18 Nov 2022 07:20:46 -0800 Subject: [PATCH 03/11] Fix the bug of FusedLayerNorm on ROCm (#96) --- .../fused_kernels/layer_norm_cuda_kernel.cu | 57 ++++++++++++------- megatron/model/fused_layer_norm.py | 7 ++- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index efd2ff707..8a07806b1 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -76,7 +76,8 @@ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) + U* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -106,12 +107,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1<(muB,sigma2B,countB,mu,sigma2,count); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -160,7 +160,8 @@ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf) + float* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -201,12 +202,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1< 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -308,7 +308,8 @@ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* 
__restrict__ beta + const V* __restrict__ beta, + const int GPU_WARP_SIZE ) { // Assumptions: @@ -323,7 +324,7 @@ void cuApplyLayerNorm( SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf); + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); const T* lvals = vals + i1*n2; V* ovals = output_vals + i1*n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -686,7 +687,11 @@ void HostApplyLayerNorm( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); - const dim3 threads(32,4,1); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); @@ -701,7 +706,9 @@ void HostApplyLayerNorm( input, n1,n2, U(epsilon), - gamma,beta); + gamma, + beta, + warp_size); } @@ -754,11 +761,16 @@ void HostLayerNormGradient( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); if (gamma != NULL && beta != NULL) { // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else const int part_size = 16; - const dim3 threads2(32,4,1); +#endif + const dim3 threads2(warp_size,4,1); const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); @@ -777,7 +789,7 @@ void HostLayerNormGradient( part_grad_gamma.DATA_PTR(), part_grad_beta.DATA_PTR()); - const dim3 threads3(32,8,1); + const dim3 threads3(warp_size,8,1); const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -793,7 +805,10 @@ void HostLayerNormGradient( const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(32,4,1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif int nshared = threads1.y > 1 ? threads1.y*threads1.x*sizeof(U) : diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 78645c236..f34a05ffe 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -22,6 +22,7 @@ from torch.nn.parameter import Parameter from torch.nn import init import importlib +from torch.nn import functional as F global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -84,7 +85,11 @@ def reset_parameters(self): def forward(self, input): - + # CPU path is here for unittest sake. + if not input.is_cuda: + print("WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedLayerNorm unit tests.") + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) From f7ebcad3f94c090ce0a20d69628229e0f39a1f23 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Wed, 7 Dec 2022 10:37:43 -0800 Subject: [PATCH 04/11] Fix generate_text.sh Megatron text-generation example working w/ DS inference (#99) This PR gets text-generation working in the `examples/generate_text.sh` example working w/ DS inference enabled. 
For the main fix, the `sample_sequence_batch` function has been updated to perform the softmax when calculating `log_probs`, instead of setting it to 1's using `torch.ones_like(...)`, although a few minor fixes were applied as well. Extra whitespace is also removed. --- examples/generate_text.sh | 11 ++++++----- megatron/checkpointing.py | 9 +++++---- megatron/text_generation_utils.py | 12 +++++------- tools/generate_samples_gpt.py | 16 ++++++++-------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/generate_text.sh b/examples/generate_text.sh index 63501fc67..7e330ab8d 100755 --- a/examples/generate_text.sh +++ b/examples/generate_text.sh @@ -5,7 +5,7 @@ VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt b=8 mp=1 -experts=2 +experts=1 nodes=1 gpus=1 @@ -19,7 +19,7 @@ ds_inference="--ds-inference" launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" L=24 -H=2048 +H=1024 A=16 #experts1=${experts[$k]} program_cmd="tools/generate_samples_gpt.py \ @@ -33,15 +33,16 @@ program_cmd="tools/generate_samples_gpt.py \ --num-experts ${experts} \ --mlp-type standard \ --micro-batch-size $b \ - --seq-length 10 \ - --out-seq-length 10 \ + --seq-length 1024 \ + --out-seq-length 1024 \ --temperature 1.0 \ --vocab-file $VOCAB_FILE \ --merge-file $MERGE_FILE \ --genfile unconditional_samples.json \ --top_p 0.9 \ --log-interval 1 \ - --num-samples $((100*$b)) + --num-samples 0 \ + --load $CHECKPOINT_PATH \ $use_tutel $ds_inference" echo $launch_cmd $program_cmd diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 8f6b9ec28..52c2a84da 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -137,7 +137,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): for i in range(len(model)): mpu.set_virtual_pipeline_model_parallel_rank(i) state_dict['model%d' % i] = model[i].state_dict_for_save_checkpoint() - + # Optimizer stuff. if not args.no_save_optim: if optimizer is not None: @@ -169,7 +169,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): # Saving is a collective communication checkpoint_name = get_checkpoint_name(args.save, iteration) - + # Trim off the filename and mp_rank_* directory. 
for _ in range(3): checkpoint_name = os.path.dirname(checkpoint_name) @@ -201,7 +201,8 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model): # specific to self attention so should work for cross attention as well while hasattr(model, 'module'): model = model.module - attention_module = model.language_model.encoder.layers[0].self_attention + #attention_module = model.language_model.encoder.layers[0].self_attention + attention_module = model.language_model.encoder.layers[0].attention hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition if num_splits_first: @@ -442,7 +443,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True def load_biencoder_checkpoint(model, only_query_model=False, only_context_model=False, custom_load_path=None): """ - selectively load retrieval models for indexing/retrieving + selectively load retrieval models for indexing/retrieving from saved checkpoints """ diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7e81e5fae..adf04bcb4 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -191,7 +191,7 @@ def generate_samples_input_from_file(model): context_count += 1 # We added this function to support the tasks evaluation such as squad -# and drop in the https://github.com/EleutherAI/lm-evaluation-harness +# and drop in the https://github.com/EleutherAI/lm-evaluation-harness # codebase. The lm-evaluation-harness code can now call this function # similar to their current generate function call used for gpt style models. def generate_samples_eval(model, context, max_gen_length, eos_token_id): @@ -218,7 +218,7 @@ def generate_samples_eval(model, context, max_gen_length, eos_token_id): decode_tokens = decode_tokens[0].cpu().numpy().tolist() trim_decode_tokens = tokenizer.detokenize( decode_tokens)[raw_text_len:] - + return trim_decode_tokens @@ -416,9 +416,9 @@ def get_token_stream(model, context_tokens, model_latencies=[], single_token_lat batch_token_iterator = sample_sequence_batch(model, context_tokens_tensor, context_length_tensor, attention_mask, position_ids, model_latencies=model_latencies) - + count = 0 - + t0=time.time() for tokens, lengths in batch_token_iterator: if count > 1: @@ -559,9 +559,7 @@ def sample_sequence_batch(model, context_tokens, context_lengths, logits /= args.temperature logits = top_k_logits(logits, top_k=args.top_k, top_p=args.top_p) - log_probs = torch.ones_like(logits) - #TODO: Fix this - #log_probs = F.softmax(logits, dim=-1) + log_probs = F.softmax(logits, dim=-1) prev = torch.multinomial(log_probs, num_samples=1).view(-1) started = context_lengths <= context_length diff --git a/tools/generate_samples_gpt.py b/tools/generate_samples_gpt.py index 5df2c698e..bbd1164c4 100644 --- a/tools/generate_samples_gpt.py +++ b/tools/generate_samples_gpt.py @@ -41,7 +41,7 @@ def model_provider(pre_process=True, post_process=True): print_rank_0('building GPT model ...') model = GPTModel(num_tokentypes=0, parallel_output=False, - pre_process=pre_process, post_process=post_process, + pre_process=pre_process, post_process=post_process, return_moe_loss=False) # we need to set "return_moe_loss" for the inference_mode return model @@ -102,7 +102,7 @@ def print_latency(latency_set, title=""): print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000)) print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000)) print("\t999 Latency: {0:8.2f} 
ms".format(p999 * 1000)) - + def main(): """Main program.""" latencies = [] @@ -115,7 +115,7 @@ def main(): 'no_load_optim': True}) args = get_args() - + if args.num_layers_per_virtual_pipeline_stage is not None: print("Interleaved pipeline schedule is not yet supported for text generation.") exit() @@ -142,8 +142,8 @@ def main(): generate_samples_interactive(model) else: generate_and_write_samples_unconditional(model, latencies, single_token_latency, model_latencies) - - + + #if torch.cuda.current_device() == 0: if torch.distributed.get_rank() == 0: print_latency(latencies) @@ -154,13 +154,13 @@ def main(): def ds_inference(model, args): import megatron.model as mm engine = deepspeed.init_inference(model=model, - mp_size=args.tensor_model_parallel_size, - mpu=mpu, + mp_size=args.tensor_model_parallel_size, + tensor_parallel={"mpu": mpu}, dtype=torch.half, replace_with_kernel_inject=True, moe_experts=args.num_experts, moe_type=args.mlp_type) - + return engine.module if __name__ == "__main__": From 5e8d578483d2f5ea963eec1d3ad1e797117b6706 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 12 Dec 2022 15:15:04 -0800 Subject: [PATCH 05/11] DeepSpeed Data Efficiency Library pretraining examples (#100) * staging_data_efficiency_v1 (#12) * refactor and clean * script refactor * fix * fix * fix * fix * refactor * script * CL diff type * script cleanup * fix for MP * refactor * refactor * fix * apply feedback --- examples/README.md | 5 + examples/bert_with_pile/prepare_pile_data.py | 4 +- examples/data_efficiency/README.md | 23 + examples/data_efficiency/analyze_data.py | 239 ++++++++ .../bert/ds_analyze_bert_data_map.sh | 67 +++ .../bert/ds_analyze_bert_data_reduce.sh | 66 +++ .../finetune/ds_config_bert_TEMPLATE.json | 24 + .../bert/finetune/ds_finetune_bert_mnli.sh | 150 +++++ .../bert/finetune/ds_finetune_bert_qqp.sh | 158 ++++++ .../bert/finetune/ds_finetune_bert_race.sh | 172 ++++++ .../finetune/ds_finetune_gather_result.py | 111 ++++ .../bert/pile_data_download_preprocess.py | 129 +++++ .../ds_config_bert_1clmetric_TEMPLATE.json | 74 +++ .../ds_config_bert_2clmetrics_TEMPLATE.json | 88 +++ .../ds_pretrain_bert_336M_base_script.sh | 472 ++++++++++++++++ .../pretrain/ds_pretrain_bert_336M_run.sh | 241 ++++++++ .../gpt/ds_analyze_gpt_data_map.sh | 70 +++ .../gpt/ds_analyze_gpt_data_reduce.sh | 69 +++ .../gpt/eval/ds_config_eval_dummy.json | 28 + .../gpt/eval/ds_evalharness_1gpu.sh | 77 +++ .../gpt/eval/ds_evalharness_gather_result.py | 358 ++++++++++++ .../gpt/eval/ds_evalharness_parallel_run.sh | 66 +++ .../ds_evalharness_parallel_run_10shot.sh | 61 +++ .../ds_config_gpt_1clmetric_TEMPLATE.json | 74 +++ .../ds_config_gpt_2clmetrics_TEMPLATE.json | 88 +++ .../ds_pretrain_gpt_1.3B_dense_base_script.sh | 515 ++++++++++++++++++ .../ds_pretrain_gpt_1.3B_dense_run.sh | 366 +++++++++++++ megatron/arguments.py | 21 +- megatron/data/bert_dataset.py | 6 +- megatron/data/dataset_utils.py | 49 +- megatron/data/gpt_dataset.py | 23 +- megatron/initialize.py | 11 +- megatron/model/gpt_model.py | 2 +- megatron/model/language_model.py | 11 +- megatron/model/transformer.py | 6 +- megatron/model/utils.py | 2 +- megatron/training.py | 151 ++++- megatron/utils.py | 5 +- pretrain_bert.py | 32 +- pretrain_gpt.py | 42 +- tasks/eval_harness/evaluate.py | 5 +- 41 files changed, 4087 insertions(+), 74 deletions(-) create mode 100644 examples/data_efficiency/README.md create mode 100644 examples/data_efficiency/analyze_data.py create mode 100644 examples/data_efficiency/bert/ds_analyze_bert_data_map.sh create mode 
100644 examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py create mode 100644 examples/data_efficiency/bert/pile_data_download_preprocess.py create mode 100644 examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh create mode 100644 examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh create mode 100644 examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh create mode 100644 examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh create mode 100644 examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh create mode 100644 examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json create mode 100644 examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json create mode 100644 examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh create mode 100644 examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh diff --git a/examples/README.md b/examples/README.md index 80b1d4488..a130a5312 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,9 +12,14 @@ If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS bas Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models and dense models. These recipes are for GPT-style NLG models. +### Data Efficiency + +The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for DeepSpeed Data Efficiency Library. Please refer to the detailed tutorials in data_efficiency/README.MD. + ### Curriculum Learning Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models. +Note that the DeepSpeed Data Efficiency Library above includes a more general curriculum learning support. This legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library above. 
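For reference, the legacy feature is driven by a ```curriculum_learning``` block in the DeepSpeed config JSON. A minimal sketch is shown below; the values are illustrative only, and the recommended settings are in the tutorials linked inside the ```curriculum_learning``` folder.

```json
{
  "curriculum_learning": {
    "enabled": true,
    "curriculum_type": "seqlen",
    "min_difficulty": 8,
    "max_difficulty": 1024,
    "schedule_type": "fixed_linear",
    "schedule_config": {
      "total_curriculum_step": 15000,
      "difficulty_step": 8
    }
  }
}
```

The Data Efficiency Library supersedes this block with a more general ```data_efficiency``` config section; see the ds_config_*_TEMPLATE.json files added under ```data_efficiency``` for complete examples.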
### Model Compression diff --git a/examples/bert_with_pile/prepare_pile_data.py b/examples/bert_with_pile/prepare_pile_data.py index d5eed96d5..953d5966d 100644 --- a/examples/bert_with_pile/prepare_pile_data.py +++ b/examples/bert_with_pile/prepare_pile_data.py @@ -2,9 +2,9 @@ import sys import time import os - import sys -sys.path.insert(1, '../../') +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) from megatron.data import indexed_dataset def pile_download(download_url, file_path, i): diff --git a/examples/data_efficiency/README.md b/examples/data_efficiency/README.md new file mode 100644 index 000000000..e10db17d5 --- /dev/null +++ b/examples/data_efficiency/README.md @@ -0,0 +1,23 @@ +This directory includes GPT-3/BERT pretraining example scripts for DeepSpeed Data Efficiency Library technologies (curriculum learning, random-LTD, and the two composed together). + +You need to install updated DeepSpeed version (>=0.8.0), which contains the DeepSpeed Data Efficiency Library. + +Additional tutorial can be found at [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/). + +Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597). + +## GPT-3 pretraining and evaluation +Inside ``gpt`` folder, first the ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing. + +``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality. + +``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot. + +## BERT pretraining and finetuning +Inside ``bert`` folder, first the ``pile_data_download_preprocess.py`` can be used to download and preprocess the public Pile dataset. + +The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing. + +``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality. + +``bert/finetune`` includes the finetuning example scripts. \ No newline at end of file diff --git a/examples/data_efficiency/analyze_data.py b/examples/data_efficiency/analyze_data.py new file mode 100644 index 000000000..36002d46d --- /dev/null +++ b/examples/data_efficiency/analyze_data.py @@ -0,0 +1,239 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import os +import time +import sys +import math +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) +from datetime import datetime +import numpy as np +import torch + +from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer \ + import DataAnalyzer +from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset \ + import MMapIndexedDataset + +from megatron import get_args +from megatron import print_rank_0 +from megatron.initialize import initialize_megatron + +def get_tasks_args(parser): + """Provide extra arguments required for data analyzing.""" + group = parser.add_argument_group(title='data_analyzing') + + group.add_argument('--analyzing-task', type=str, required=True, + default=None, + choices=['map', + 'reduce'], + help='What type of analyzing task to perform.') + group.add_argument('--analyzing-data-type', type=str, required=True, + default=None, + choices=['BERT', + 'GPT'], + help='What type of data.') + group.add_argument('--analyzing-metric', type=str, nargs='+', default=[], + help='What kinds of metrics to analyze.') + group.add_argument('--analyzing-num-workers', type=int, default=1, + help='Number of workers. Each worker could be a single CPU node.') + group.add_argument('--analyzing-worker-id', type=int, default=0, + help='Worker id of current node.') + group.add_argument('--analyzing-num-threads', type=int, default=1, + help='Number of threads for each worker.') + group.add_argument('--analyzing-num-threads-reduce', type=int, default=1, + help='Number of threads for each worker.') + group.add_argument('--analyzing-specific-threads', type=int, nargs='+', default=[], + help='Which specific threads to run. 
Helpful when there are specific thread failed in previous run.') + return parser + +def train_valid_test_datasets_provider_gpt(): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + from megatron.data.gpt_dataset import build_train_valid_test_datasets + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + +def train_valid_test_datasets_provider_bert(): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for BERT ...') + from megatron.data.dataset_utils import build_train_valid_test_datasets + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them + max_seq_length=args.seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + binary_head=args.bert_binary_head) + print_rank_0("> finished creating BERT datasets ...") + + return train_ds, valid_ds, test_ds + +def metric_seqlen(data): + metric = torch.count_nonzero(data['padding_mask'], dim=1) + return metric + +def metric_total_vocab_freq(data): + args = get_args() + if args.analyzing_data_type == 'BERT': + frequency = torch.bincount(data['text'].view(-1), + minlength=args.padded_vocab_size+1, + weights=data['padding_mask'].view(-1)) + elif args.analyzing_data_type == 'GPT': + frequency = torch.bincount(data['text'].view(-1), + minlength=args.padded_vocab_size+1) + return frequency + +def metric_vocab_rarity(data): + args = get_args() + if args.analyzing_data_type == 'BERT': + rarity = torch.sum(data['padding_mask'] * \ + args.total_vocab_freq[data['text']], dim=1).to(torch.long) + elif args.analyzing_data_type == 'GPT': + rarity = [] + # Do one by one to avoid too high memory consumption + for row in range(data['text'].size()[0]): + rarity.append(int(torch.sum(args.total_vocab_freq[data['text'][row]]).item())) + rarity = torch.tensor(rarity, dtype=torch.long) + print(f"rarity min {min(rarity)}, max {max(rarity)}, len {len(rarity)}, avg {sum(rarity)/len(rarity)}") + return rarity + +def metric_seqlen_vocab_rarity(data): + args = get_args() + metric = torch.count_nonzero(data['padding_mask'], dim=1).to(torch.long) * args.seqlen_coeff + metric += torch.sum(data['padding_mask'] * \ + args.total_vocab_freq[data['text']], dim=1).to(torch.long) + print(f"metric min {min(metric)}, max {max(metric)}, len {len(metric)}, avg {sum(metric)/len(metric)}") + return metric + +def get_metric_function(metric_name): + if metric_name == 'seqlen': + return metric_seqlen + if metric_name == 'total_vocab_freq': + return metric_total_vocab_freq + if metric_name == 'vocab_rarity': + return metric_vocab_rarity + if metric_name == 'seqlen_vocab_rarity': + return metric_seqlen_vocab_rarity + +def get_metric_type(metric_name): + if metric_name == 'seqlen': + return 
'single_value_per_sample' + if metric_name == 'total_vocab_freq': + return 'accumulate_value_over_samples' + if metric_name == 'vocab_rarity': + return 'single_value_per_sample' + if metric_name == 'seqlen_vocab_rarity': + return 'single_value_per_sample' + +def run_map(): + args = get_args() + if args.analyzing_data_type == 'BERT': + args.mask_prob = 0 # When analyzing data, we don't want any mask. + train_ds, _, _ = train_valid_test_datasets_provider_bert() + elif args.analyzing_data_type == 'GPT': + train_ds, _, _ = train_valid_test_datasets_provider_gpt() + assert 'seqlen' not in args.analyzing_metric, 'GPT data has fixed seqlen, thus unnecessary to analyze seqlen metric.' + assert 'seqlen_vocab_rarity' not in args.analyzing_metric, 'GPT data has fixed seqlen, thus unnecessary to analyze seqlen metric.' + if 'vocab_rarity' in args.analyzing_metric or 'seqlen_vocab_rarity' in args.analyzing_metric: + total_vocab_freq_fname = f"{args.save}/total_vocab_freq/total_vocab_freq_metric_value" + assert os.path.isfile(f"{total_vocab_freq_fname}.bin") and os.path.isfile(f"{total_vocab_freq_fname}.idx"), "To analyze vocab rarity, first need to analyze the total vocab freq." + total_vocab_freq = MMapIndexedDataset(total_vocab_freq_fname, skip_warmup=True) + total_vocab_freq = np.copy(total_vocab_freq[0]) + total_vocab_freq[total_vocab_freq == 0] = 1 # Avoid log(0) error + total_vocab_freq = np.log(total_vocab_freq/sum(total_vocab_freq)) * -1 + args.total_vocab_freq = torch.tensor(total_vocab_freq, dtype=torch.double) + if 'seqlen_vocab_rarity' in args.analyzing_metric: + # Use large coeff to make seqlen dominates vocab_rarity + max_possible_rarity = args.seq_length * torch.max(args.total_vocab_freq).item() + args.seqlen_coeff = 10 ** (math.ceil(math.log(max_possible_rarity, 10)) + 1) + print(f"Metric seqlen_vocab_rarity: using {args.seqlen_coeff} as coefficient for seqlen.") + metric_functions = [get_metric_function(x) for x in args.analyzing_metric] + metric_types = [get_metric_type(x) for x in args.analyzing_metric] + # For metric_dtypes we int64 by default since it could be hard to estimate + # the appropriate dtype before the mapping analysis. During reduce where + # we merge the analysis results, the DataAnalyzer will automatically choose + # the dtype of merged result file as the smallest one that meet the range + # requirement. + metric_dtypes = [np.int64 for x in args.analyzing_metric] + start = time.time() + data_analyzer = DataAnalyzer(train_ds, + num_workers=args.analyzing_num_workers, + worker_id=args.analyzing_worker_id, + num_threads=args.analyzing_num_threads, + specific_threads=args.analyzing_specific_threads, + batch_size=args.global_batch_size, metric_names=args.analyzing_metric, + metric_functions=metric_functions, metric_types=metric_types, + metric_dtypes=metric_dtypes, save_path=args.save) + data_analyzer.run_map() + duration = (time.time() - start) / 3600.0 + print(f"map job finished in {duration} hr.") + +def run_reduce(): + args = get_args() + if args.analyzing_data_type == 'BERT': + args.mask_prob = 0 # When analyzing data, we don't want any mask. 
+ train_ds, _, _ = train_valid_test_datasets_provider_bert() + elif args.analyzing_data_type == 'GPT': + train_ds, _, _ = train_valid_test_datasets_provider_gpt() + metric_functions = [get_metric_function(x) for x in args.analyzing_metric] + metric_types = [get_metric_type(x) for x in args.analyzing_metric] + metric_dtypes = [np.int64 for x in args.analyzing_metric] + start = time.time() + data_analyzer = DataAnalyzer(train_ds, + num_workers=args.analyzing_num_workers, + num_threads=args.analyzing_num_threads, + num_threads_reduce=args.analyzing_num_threads_reduce, + batch_size=args.global_batch_size, metric_names=args.analyzing_metric, + metric_functions=metric_functions, metric_types=metric_types, + metric_dtypes=metric_dtypes, save_path=args.save) + data_analyzer.run_reduce() + duration = (time.time() - start) / 3600.0 + print(f"reduce job finished in {duration} hr.") + +if __name__ == "__main__": + initialize_megatron(extra_args_provider=get_tasks_args, allow_no_cuda=True) + args = get_args() + if args.analyzing_task == 'map': + run_map() + elif args.analyzing_task == 'reduce': + run_reduce() + else: + raise NotImplementedError('Task {} is not implemented.'.format( + args.analyzing_task)) diff --git a/examples/data_efficiency/bert/ds_analyze_bert_data_map.sh b/examples/data_efficiency/bert/ds_analyze_bert_data_map.sh new file mode 100644 index 000000000..7f23e3615 --- /dev/null +++ b/examples/data_efficiency/bert/ds_analyze_bert_data_map.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +num_workers=1 # Num nodes to run the map job +num_threads=40 # Num threads on each node. Set this based on #CPU cores + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=5 + +# Which node is this node (start with 0 and end with num_workers-1). This +# script only launch the map job on 1 worker node, since we don't expect +# running on many nodes and workers don't need any communication. But you +# can modify this script to add a MPI/torch distributed launcher. +worker_id=$1 +save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen' + +seq_len=512 +batch_size=10000 + +jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" +## Public the Pile dataset, see prepare_pile_data.py in the same directory +## about how to download and preprocess the data. +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_bert" +data_home="/blob/data/the_pile_bert" +data_path="${data_home}/pile_bert_train_text_sentence" + +vocab_path="bert-large-uncased-vocab.txt" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
+options=" \ + --analyzing-task map \ + --analyzing-data-type BERT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-worker-id ${worker_id} \ + --analyzing-num-threads ${num_threads} \ + --vocab-file ${vocab_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type BertWordPieceLowerCase \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh b/examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh new file mode 100644 index 000000000..f0d14df96 --- /dev/null +++ b/examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Set these 2 to the same as what you used during map job. We need these 2 +# configs to know how many map job result files do we have. +num_workers=1 +num_threads=40 +# Reduce job only has 1 worker but can accelerate by multithreading. +num_threads_reduce=40 + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=5 + +save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen' + +seq_len=512 +batch_size=10000 + +jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce" +## Public the Pile dataset, see prepare_pile_data.py in the same directory +## about how to download and preprocess the data. +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_bert" +data_home="/blob/data/the_pile_bert" +data_path="${data_home}/pile_bert_train_text_sentence" + +vocab_path="bert-large-uncased-vocab.txt" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
+options=" \ + --analyzing-task reduce \ + --analyzing-data-type BERT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-num-threads ${num_threads} \ + --analyzing-num-threads-reduce ${num_threads_reduce} \ + --vocab-file ${vocab_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type BertWordPieceLowerCase \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json b/examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json new file mode 100644 index 000000000..2700805d1 --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json @@ -0,0 +1,24 @@ +{ + "train_batch_size" : CONFIG_BATCH_SIZE, + "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false +} diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh b/examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh new file mode 100644 index 000000000..e88f7beb0 --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh @@ -0,0 +1,150 @@ +seed=1234 +pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. +seq_len=512 + +## From Table 6 in https://arxiv.org/abs/1909.08053. +task="MNLI" +global_batch_size=128 +lr=1e-5 +epochs=10 + +train_data="/blob/data/GlueData/MNLI/train.tsv" +valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \ + /blob/data/GlueData/MNLI/dev_mismatched.tsv" + +## Adjust based on number of GPUs. +batch_size=16 + +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. 
+## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 +save_interval=500000 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! -f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}" +mkdir -p ${checkpoint_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.065 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --save-interval ${save_interval} \ + --save ${checkpoint_path} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev set accuracy numbers by +# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log" +deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh b/examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh new file mode 100644 index 000000000..8083e1024 --- 
/dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh @@ -0,0 +1,158 @@ +seed=1234 +pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. +seq_len=512 + +## From Table 6 in https://arxiv.org/abs/1909.08053. +task="QQP" + +train_data="/blob/data/GlueData/QQP/train.tsv" +valid_data="/blob/data/GlueData/QQP/dev.tsv" + +## Adjust based on number of GPUs. +batch_size=16 + +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=128 +# lr=5e-5 +# epochs=12 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 +global_batch_size=128 +lr=5e-5 +epochs=12 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 +# global_batch_size=128 +# lr=3e-5 +# epochs=12 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +# global_batch_size=256 +# lr=4e-5 +# epochs=12 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 +save_interval=500000 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}" +mkdir -p ${checkpoint_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.065 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --save-interval ${save_interval} \ + --save ${checkpoint_path} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev set accuracy numbers by +# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log" +deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh b/examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh new file mode 100644 index 000000000..15658e3d2 --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh @@ -0,0 +1,172 @@ +seed=1234 +## RACE have two sub-tasks that need to be finetuned separately +difficulty="middle" +# difficulty="high" +pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. 
+seq_len=512 + +## From Table 6 in https://arxiv.org/abs/1909.08053. +task="RACE" + +## Race dataset can be downloaded by: +## wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz +train_data="/blob/data/RACE/train/${difficulty}" + +## The Megatron paper https://arxiv.org/abs/1909.08053 says: "For the test set +## results of RACE, we first use the development set to find the checkpoint +## that gives us the median score on the 5 random seeds and we report the +## results from that checkpoint on the test set", which is a quite confusing +## description. For simplicity, instead we directly get the median dev and test +## set score on 5 random seeds from a single pretrained_checkpoint. +valid_data="/blob/data/RACE/dev/${difficulty} \ + /blob/data/RACE/test/${difficulty}" + +## Adjust based on number of GPUs. +batch_size=4 + +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=32 +# lr=2e-5 +# epochs=3 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 +global_batch_size=32 +lr=2e-5 +epochs=3 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 +# global_batch_size=16 +# lr=1e-5 +# epochs=3 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +# global_batch_size=32 +# lr=2e-5 +# epochs=3 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 +save_interval=100000 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-${difficulty}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}" +mkdir -p ${checkpoint_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.06 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --save-interval ${save_interval} \ + --save ${checkpoint_path} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --clip-grad 1.0 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev/test set accuracy numbers +# by "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log" +deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py b/examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py new file mode 100644 index 000000000..6fffe829d --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py @@ -0,0 +1,111 @@ +import os +import statistics + +def gather_numbers(fname, match_keywords, index_keywords, index_offsets): + results = {} + for k in index_keywords: + results[k] = [] + file1 = open(fname, 'r') + while True: + line = file1.readline() + if not line: + break + splits = line.split(' ') + for i in range(len(match_keywords)): + if match_keywords[i] in line: + ref_idx = splits.index(index_keywords[i]) + results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]])) + file1.close() + return results + +def gather_MNLI_results(result_path): + overall = [] + matched = [] + 
mismatched = [] + for file in os.listdir(result_path): + if file.startswith('MNLI'): + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + results = gather_numbers(fname, + ['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'], + ['overall:', 'dev-matched:', 'dev-mismatched:'], + [9, 9, 9]) + overall_candidate = results['overall:'] + matched_candidate = results['dev-matched:'] + mismatched_candidate = results['dev-mismatched:'] + if len(overall_candidate) > 0: + assert len(overall_candidate) == len(matched_candidate) and len(overall_candidate) == len(mismatched_candidate) + best_index = overall_candidate.index(max(overall_candidate)) + overall.append(overall_candidate[best_index]) + matched.append(matched_candidate[best_index]) + mismatched.append(mismatched_candidate[best_index]) + if len(overall) > 0: + if len(overall) % 2 == 1: + median_idx = overall.index(statistics.median(overall)) + else: + median_idx = overall.index(statistics.median_high(overall)) + print(f'MNLI how Megatron paper reported: overall results median {statistics.median(overall)}, corresponding matched/mismatched: {matched[median_idx]}/{mismatched[median_idx]}') + print(f'MNLI other results:') + print(f'MNLI overall results {overall}, median {statistics.median(overall)} (corresponding matched/mismatched {matched[median_idx]}/{mismatched[median_idx]}), mean {statistics.mean(overall)}, std {statistics.stdev(overall)}') + print(f'MNLI matched results {matched}, median {statistics.median(matched)}, mean {statistics.mean(matched)}, std {statistics.stdev(matched)}') + print(f'MNLI mismatched results {mismatched}, median {statistics.median(mismatched)}, mean {statistics.mean(mismatched)}, std {statistics.stdev(mismatched)}') + else: + print("Didn't find any MNLI result") + +def gather_QQP_results(result_path): + overall = [] + for file in os.listdir(result_path): + if file.startswith('QQP'): + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + results = gather_numbers(fname, ['overall:'], ['overall:'], [9]) + overall_candidate = results['overall:'] + if len(overall_candidate) > 0: + best_index = overall_candidate.index(max(overall_candidate)) + overall.append(overall_candidate[best_index]) + if len(overall) > 0: + print(f'QQP how Megatron paper reported: overall results median {statistics.median(overall)}') + print(f'QQP other results:') + print(f'QQP overall results {overall}, median {statistics.median(overall)}, mean {statistics.mean(overall)}, std {statistics.stdev(overall)}') + else: + print("Didn't find any QQP result") + +def gather_RACE_results(result_path, task): + dev = [] + test = [] + for file in os.listdir(result_path): + if file.startswith(f'RACE-{task}'): + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + results = gather_numbers(fname, + [f'metrics for dev-{task}:', f'metrics for test-{task}:'], + [f'dev-{task}:', f'test-{task}:'], + [9, 9]) + dev_candidate = results[f'dev-{task}:'] + test_candidate = results[f'test-{task}:'] + if len(dev_candidate) > 0: + assert len(dev_candidate) == len(test_candidate) + dev.append(max(dev_candidate)) + test.append(max(test_candidate)) + if len(dev) > 0: + if len(dev) % 2 == 1: + median_idx = dev.index(statistics.median(dev)) + else: + median_idx = dev.index(statistics.median_high(dev)) + print(f'RACE-{task} how Megatron paper reported: test result from the median of dev results {test[median_idx]}') + print(f'RACE-{task} other results:') + print(f'RACE-{task} dev results {dev}, median 
{statistics.median(dev)}, mean {statistics.mean(dev)}, std {statistics.stdev(dev)}') + print(f'RACE-{task} test results {test}, median {statistics.median(test)}, mean {statistics.mean(test)}, std {statistics.stdev(test)}') + else: + print(f"Didn't find any RACE-{task} result") + +def gather_finetune_results(result_path): + print(f'Gather finetune results for {result_path}') + gather_MNLI_results(result_path) + gather_QQP_results(result_path) + gather_RACE_results(result_path, 'middle') + gather_RACE_results(result_path, 'high') + +if __name__ == '__main__': + result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/' + gather_finetune_results(result_path) \ No newline at end of file diff --git a/examples/data_efficiency/bert/pile_data_download_preprocess.py b/examples/data_efficiency/bert/pile_data_download_preprocess.py new file mode 100644 index 000000000..1eb34124b --- /dev/null +++ b/examples/data_efficiency/bert/pile_data_download_preprocess.py @@ -0,0 +1,129 @@ +import zstandard +import sys +import time +import os + +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir,os.path.pardir))) +from megatron.data import indexed_dataset + +def pile_download(download_url, file_path, i): + start = time.time() + zstd_file_path = f"{file_path}{i:02}.jsonl.zst" + download_path = f"{download_url}{i:02}.jsonl.zst" + if not os.path.exists(zstd_file_path): + os.system(f"wget -P {file_path} {download_path}") + print(f"Finished downloading chunk {i} in {time.time() - start} sec") + +def pile_decompress(download_url, file_path, i): + zstd_file_path = f"{file_path}{i:02}.jsonl.zst" + output_path = f"{file_path}{i:02}.jsonl" + if not os.path.exists(output_path): + if not os.path.exists(zstd_file_path): + pile_download(download_url, file_path, i) + start = time.time() + with open(zstd_file_path, 'rb') as compressed: + decomp = zstandard.ZstdDecompressor() + with open(output_path, 'wb') as destination: + decomp.copy_stream(compressed, destination) + os.remove(zstd_file_path) + print(f"Finished decompressing chunk {i} in {time.time() - start} sec") + +def pile_preprocess(download_url, file_path, vocab_file, num_workers, i): + json_file_path = f"{file_path}{i:02}.jsonl" + output_prefix = f"{file_path}pile_bert_train_{i:02}" + if not os.path.exists(f"{output_prefix}_text_sentence.idx"): + if not os.path.exists(json_file_path): + pile_decompress(download_url, file_path, i) + start = time.time() + cmd = f"python ../../tools/preprocess_data.py \ + --input {json_file_path} \ + --output-prefix {output_prefix} \ + --vocab {vocab_file} \ + --dataset-impl mmap \ + --tokenizer-type BertWordPieceLowerCase \ + --split-sentences \ + --workers {num_workers} " + # It's possible to hit MemoryError during above cmd since the memory + # usage is proportional to num_workers. In this case we delete the + # incomplete output and user shall retry with smaller num_workers. + # Our experience show that chunk 6, 7, 9, 17, 18, 20, 21, 24, 27 + # particularly have large memory usage. + if os.system(cmd) == 0: # Success + os.remove(json_file_path) + else: + print(f"Error: chunk {i} preprocessing got error, delete \ + incomplete output. 
If a MemoryError appeared, please retry \
+                with num_workers smaller than {num_workers}.")
+            if os.path.exists(f"{output_prefix}_text_sentence.idx"):
+                os.remove(f"{output_prefix}_text_sentence.idx")
+            if os.path.exists(f"{output_prefix}_text_sentence.bin"):
+                os.remove(f"{output_prefix}_text_sentence.bin")
+        print(f"Finished preprocessing chunk {i} in {time.time() - start} sec")
+
+def pile_merge(file_path):
+    start = time.time()
+    num_chunks = 30
+    vocab_size = 30524
+    for i in range(num_chunks):
+        output_prefix = f"{file_path}pile_bert_train_{i:02}"
+        assert os.path.exists(f"{output_prefix}_text_sentence.idx")
+        assert os.path.exists(f"{output_prefix}_text_sentence.bin")
+    builder = indexed_dataset.make_builder(
+        f"{file_path}pile_bert_train_text_sentence.bin", impl="mmap",
+        vocab_size=vocab_size)
+    for i in range(num_chunks):
+        chunk_file = f"{file_path}pile_bert_train_{i:02}_text_sentence"
+        print(f"Merging file {chunk_file}")
+        builder.merge_file_(chunk_file)
+    print("Finalizing merged file ...")
+    builder.finalize(f"{file_path}pile_bert_train_text_sentence.idx")
+    print(f"Finished merging in {time.time() - start} sec")
+    # After verifying the merged data with real training, you may want to
+    # delete the data chunks.
+    # for i in range(num_chunks):
+    #     output_prefix = f"{file_path}pile_bert_train_{i:02}"
+    #     os.remove(f"{output_prefix}_text_sentence.idx")
+    #     os.remove(f"{output_prefix}_text_sentence.bin")
+
+if __name__ == '__main__':
+    # Path to download and store all the output files during the whole process.
+    # Estimated max storage usage would be around 1.6 TB (or 780GB if you skip
+    # the final merge). Memory usage is proportional to the num_workers below
+    # (can be as high as O(300GB) if num_workers is around 20).
+    file_path = "/blob/data/the_pile_bert/"
+    # The raw Pile data has 30 compressed .zst chunks. To process all chunks on
+    # a single machine, run "python pile_data_download_preprocess.py range 0 30".
+    # You can also split the chunks across multiple machines to speed things up,
+    # since processing one chunk can take hours. The whole process only uses CPU.
+    if sys.argv[1] == "merge":
+        # "python pile_data_download_preprocess.py merge" means merge all 30
+        # processed data chunks. Run it only after all 30 chunks are
+        # preprocessed. The memory usage during merge is about 600GB. If you
+        # don't have enough memory, one solution is to directly use the 30 data
+        # chunks as multiple datasets. See '--data-path' in
+        # github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/arguments.py
+        pile_merge(file_path)
+    else:
+        if sys.argv[1] == "range":
+            # "python pile_data_download_preprocess.py range 0 30" means
+            # process chunks 0-29
+            selected_chunk = range(int(sys.argv[2]), int(sys.argv[3]))
+        else:
+            # "python pile_data_download_preprocess.py 2 5 8" means process
+            # chunks 2, 5, 8
+            selected_chunk = [int(x) for x in sys.argv[1:]]
+        print("selected_chunk: ", selected_chunk)
+        # Number of processes. Adjust based on your CPU/memory.
+        num_workers = 20
+        # Where the raw Pile data can be downloaded. The url may change in the
+        # future. Contact EleutherAI (https://github.com/EleutherAI/the-pile)
+        # if this url does not work. 
+ download_url = "https://the-eye.eu/public/AI/pile/train/" + vocab_file = "bert-large-uncased-vocab.txt" + vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt" + if not os.path.exists(vocab_file): + os.system(f"wget {vocab_url}") + os.makedirs(file_path, exist_ok=True) + + for i in selected_chunk: + pile_preprocess(download_url, file_path, vocab_file, num_workers, i) diff --git a/examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json b/examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json new file mode 100644 index 000000000..38846c404 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json @@ -0,0 +1,74 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "encoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json b/examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json new file mode 100644 index 000000000..2f7268dd3 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json @@ -0,0 +1,88 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + 
"random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "encoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + }, + "CL_2nd_METRIC_NAME": { + "index_to_sample_path": "CL_2nd_SAMPLE_PATH", + "index_to_metric_path": "CL_2nd_METRIC_PATH", + "difficulty_type": "CL_2nd_DIFF_TYPE", + "clustering_type": "CL_2nd_CLUSTER_TYPE", + "min_difficulty": CL_2nd_MIN, + "max_difficulty": CL_2nd_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_2nd_TOTAL_STEP, + "difficulty_step": CL_2nd_DIFF_STEP, + "root_degree": CL_2nd_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh new file mode 100644 index 000000000..551ca3118 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh @@ -0,0 +1,472 @@ +#!/bin/bash +dir=`pwd` +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. +seq_len=512 +global_batch_size=1024 +# lr=1e-4 +lr=$1 +min_lr=1e-5 + +## init_std is the standard deviation for weight initialization. Usually larger +## model needs lower std. Here we roughly follow a heuristic equation of +## sqrt(1/3/hidden_size) from https://arxiv.org/pdf/2201.11990.pdf + +## In addition, we find that the 3.9B model (even after tuning init_std) has +## NaN loss issue from the beginning thus unable to train. This is probably +## because in this example we use the public Pile data, which is a more diverse +## (and potentially more noisy) data than what used in Megatron paper. One +## potential solution is only use the sub datasets in Pile that are also +## used by Megatron paper. 
+ +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# init_std=0.02 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 +init_std=0.02 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 +# init_std=0.013 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +# init_std=0.011 +############################################################################### +### Training duration configs +## The main termination condition, original Megatron paper trains for 2M iters. +## We changed to token-based termination since data efficiency techniques could +## change token per step. +calc() { awk "BEGIN{ printf \"%.0f\n\", $* }"; } +# train_iters_in_million=2 +train_iters_in_million=$2 +train_tokens=$(calc $train_iters_in_million*1000000*$seq_len*$global_batch_size) +train_tokens_in_billion=$(calc $train_tokens/1000000000) + +## A large enough number of iters, just to make sure we index enough data. The +## only effective termination condition is the train_tokens above. +train_iters=4000000 + +## Another wall-clock time termination condition in minutes. Set it large +## enough to avoid undesired early termination. +exit_duration=30000000 +############################################################################### +### lr configs +## lr warmup and decay duration. Original Megatron paper uses 10000 warmup +## iters. We changed lr decay to token based since data efficiency techniques +## could change token per step. +lr_warmup_iters=10000 +lr_decay_tokens_in_billion=${train_tokens_in_billion} +lr_decay_tokens=${train_tokens} +lr_decay_style="linear" +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +## Note that currently both curriculum learning and random-LTD are NOT +## compatible with pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO-based data parallelism, stage=0 will disable ZeRO +zero_stage=0 + +## Total number of GPUs. ds_ssh is from DeepSpeed library. +num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2)) +num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +num_node=$(( ${num_gpus} / ${num_gpus_pernode} )) + +## Data parallel size. +dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} )) + +## Micro batch size per GPU +## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus +## Reduce it manually if GPU OOM +batch_size=$(( ${global_batch_size} / ${dp_size} )) +############################################################################### +### Random layerwise token dropping (random-LTD) configs +## random-LTD's main switch. "false" means disabled. "true" means enabled. +ltd_enabled=${3:-'false'} +## How much dropping ratio to start with. The value denotes the seqlen after +## dropping. +ltd_start=${4:-512} +## How many steps for random-LTD to gradually reduce dropping ratio to zero. 
+ltd_step_in_million=${5:-1}
+
+# ltd_enabled="true"
+# ltd_start=200
+# ltd_step_in_million=1.8
+ltd_step=$(calc $ltd_step_in_million*1000000)
+
+## For BERT pretraining, we observe that random-LTD combined with zero dropout
+## can achieve better finetune accuracy on certain tasks. However, this is not
+## guaranteed for all models/tasks. It is still recommended to try random-LTD
+## both with and without dropout.
+dropout=${6:-0.1}
+###############################################################################
+### Curriculum learning (CL) configs
+## CL's main switch. "false" means disabled. "true" means enabled.
+cl_enabled=${7:-'false'}
+## Number of CL metrics to use.
+cl_num_metric=${8:-1}
+
+## Name of difficulty metric
+cl_1st_metric=${9:-'dummy'}
+## Path to the data indexes for this difficulty metric. Samples on the ith row
+## of index_to_sample have the difficulty value given on the ith row of
+## index_to_metric.
+cl_1st_index_to_sample_path=${10:-'dummy'}
+cl_1st_index_to_metric_path=${11:-'dummy'}
+## During training, whether to increase difficulty by value or by percentile.
+cl_1st_difficulty_type=${12:-'value'}
+## "single_cluster" means no clustering is required and CL is likely achieved
+## by data postprocessing. "schedule_based" means data will be clustered based
+## on the difficulty schedule (pacing function) below.
+cl_1st_clustering_type=${13:-'single_cluster'}
+## Start difficulty
+cl_1st_min=${14:-512}
+## End difficulty
+cl_1st_max=${15:-512}
+## Total steps to reach end difficulty
+cl_1st_total_step_in_million=${16:-1}
+## When changing difficulty, always make sure it's a multiple of the
+## difficulty_step below.
+cl_1st_difficulty_step=${17:-1}
+## Root degree of the schedule (pacing function).
+cl_1st_root=${18:-1}
+
+cl_2nd_metric=${19:-'dummy'}
+cl_2nd_index_to_sample_path=${20:-'dummy'}
+cl_2nd_index_to_metric_path=${21:-'dummy'}
+cl_2nd_difficulty_type=${22:-'value'}
+cl_2nd_clustering_type=${23:-'single_cluster'}
+cl_2nd_min=${24:-2048}
+cl_2nd_max=${25:-2048}
+cl_2nd_total_step_in_million=${26:-1}
+cl_2nd_difficulty_step=${27:-1}
+cl_2nd_root=${28:-1}
+
+# cl_enabled="true"
+# cl_num_metric=2
+# cl_1st_metric="voc"
+# ## The *_index_to_sample_percentile_merged is a concatenated index for perf
+# ## improvement, but it only works when you set difficulty_type="percentile" in
+# ## ds_config. 
If you use difficulty_type="value", you need to change this to +# ## *_index_to_sample +# # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 + +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=0.96 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 + +cl_1st_total_step=$(calc $cl_1st_total_step_in_million*1000000) +cl_2nd_total_step=$(calc $cl_2nd_total_step_in_million*1000000) +############################################################################### +### Misc configs +log_interval=100 +eval_iters=10 +eval_interval=1000 +# num_save controls how frequent to save checkpoint. num_save=20 means that a +# checkpoint will be saved every 5% of training. For longer training you would +# want larger num_save to save more frequently, and vice versa. +num_save=100 +estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size})) +save_interval=$((${estimated_train_iter} / ${num_save})) + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" + +## Whether or not log optimizer states (norms, max abs values) to tensorboard. +## This is not required for training and might save GPU memory when turned off. +log_optimizer_state="true" +############################################################################### +### Output and data configs +current_time=$(date "+%Y.%m.%d_%H.%M.%S") +host="${HOSTNAME}" +seed=1234 +## Number of workers for dataloader. We found that for BERT pre-training, +## num_workers will greatly affect data loading time and overall training +## time. In our experiment with 64 GPUs, the performance reaches peak at +## num_workers = 4 but it may differ depending on hardware. Also note that +## larger num_workers add more CPU computation/memory overhead. +num_workers=4 + +## Public the Pile dataset, see ../pile_data_download_preprocess.py about how +## to download and preprocess the data. Change data_home to where you store the +## pile_bert_train_text_sentence.bin and pile_bert_train_text_sentence.idx. +data_home="/vc_data_blob/users/conglli/the_pile_bert" +if [[ "$host" == *"webxt"* ]]; then + data_home="/blob/data/the_pile_bert" +fi +data_path="${data_home}/pile_bert_train_text_sentence" +## train_idx_path forces Megatron to use a specific data index file generated +## when we analyze data. This is needed because our index for curriculum +## learning difficulty metric is based on this data index. +train_idx_path="${data_home}/pile_bert_train_text_sentence_train_indexmap_exact5ep_509msl_0.10ssp_1234s.npy" + +vocab_path="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +prescale_grad="true" +jobname="bert_${model_size}B_tok${train_tokens_in_billion}B" +jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_iters}_d${lr_decay_tokens_in_billion}B_${lr_decay_style}" +jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}" +if [[ $zero_stage -gt 0 ]]; then + jobname="${jobname}_z${zero_stage}" + prescale_grad="false" +fi +if [[ $mp_size -gt 1 ]]; then + jobname="${jobname}_mp${mp_size}" +fi +if [ "${no_pp}" = "false" ]; then + jobname="${jobname}_pp${pp_size}" +fi +jobname="${jobname}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + jobname="${jobname}_ltd_${ltd_start}_${ltd_step_in_million}M_drop${dropout}" +fi +if [ "${cl_enabled}" = "true" ]; then + jobname="${jobname}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step_in_million}M_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + jobname="${jobname}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step_in_million}M_${cl_2nd_root}" + fi +fi + +username=$(whoami) +output_home="/blob/users/${username}/project/data_efficient_bert" +log_path="${output_home}/log/" +checkpoint_path="${output_home}/checkpoint/${jobname}" +## Microsoft internal constraint: because tensorboard is logged by last rank, +## it's better to put the path in NFS instead of Blob. +tensorboard_dir="/vc_data/users/${username}/project/data_efficient_bert/tensorboard/" +tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}" +mkdir -p ${log_path} +mkdir -p ${checkpoint_path} +mkdir -p ${tensorboard_path} +if [ "${cl_enabled}" = "true" ]; then + data_cluster_path="${output_home}/data_cluster/${jobname}" + mkdir -p ${data_cluster_path} +fi +############################################################################### +data_options=" \ + --vocab-file ${vocab_path} \ + --data-path ${data_path} \ + --data-impl mmap" + +## If CL is used, make sure to set "--split" the same as what you used during +## offline data analysis&indexing. 
+megatron_options=" \ + --override-lr-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size ${mp_size} \ + --init-method-std ${init_std} \ + --lr-decay-tokens ${lr_decay_tokens} \ + --lr-warmup-iters ${lr_warmup_iters} \ + --micro-batch-size ${batch_size} \ + --exit-duration-in-mins ${exit_duration} \ + --global-batch-size ${global_batch_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --train-tokens ${train_tokens} \ + --train-iters ${train_iters} \ + --lr ${lr} \ + --min-lr ${min_lr} \ + --lr-decay-style ${lr_decay_style} \ + --split 949,50,1 \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --save-interval ${save_interval} \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --num-workers ${num_workers} \ + --fp16 \ + --seed ${seed} \ + --load ${checkpoint_path} \ + --save ${checkpoint_path} \ + --tensorboard-queue-size 1 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --tensorboard-dir ${tensorboard_path}" + +if [ "${activation_checkpoint}" = "true" ]; then +megatron_options="${megatron_options} \ + --checkpoint-activations" +fi + +if [ "${log_optimizer_state}" = "true" ]; then +megatron_options="${megatron_options} \ + --log-optimizer-states-to-tensorboard" +fi + +if [ "${ltd_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --attention-dropout ${dropout} \ + --hidden-dropout ${dropout} \ + --random-ltd" +fi + +if [ "${cl_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --train-idx-path ${train_idx_path} \ + --data-efficiency-curriculum-learning" +fi + +config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + config_json="${config_json}_ltd_${ltd_start}_${ltd_step}" +fi +if [ "${cl_enabled}" = "true" ]; then + config_json="${config_json}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + config_json="${config_json}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}" + fi +fi +config_json="${config_json}.json" +if [[ $cl_num_metric -gt 1 ]]; then +template_json="ds_config_bert_2clmetrics_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed 
"s/CL_1st_ROOT/${cl_1st_root}/" \ + | sed "s#CL_2nd_METRIC_NAME#${cl_2nd_metric}#" \ + | sed "s#CL_2nd_SAMPLE_PATH#${cl_2nd_index_to_sample_path}#" \ + | sed "s#CL_2nd_METRIC_PATH#${cl_2nd_index_to_metric_path}#" \ + | sed "s#CL_2nd_DIFF_TYPE#${cl_2nd_difficulty_type}#" \ + | sed "s#CL_2nd_CLUSTER_TYPE#${cl_2nd_clustering_type}#" \ + | sed "s/CL_2nd_MIN/${cl_2nd_min}/" \ + | sed "s/CL_2nd_MAX/${cl_2nd_max}/" \ + | sed "s/CL_2nd_TOTAL_STEP/${cl_2nd_total_step}/" \ + | sed "s/CL_2nd_DIFF_STEP/${cl_2nd_difficulty_step}/" \ + | sed "s/CL_2nd_ROOT/${cl_2nd_root}/" \ + > ${config_json} +else +template_json="ds_config_bert_1clmetric_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed "s/CL_1st_ROOT/${cl_1st_root}/" \ + > ${config_json} +fi + +deepspeed_options=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --pipeline-model-parallel-size ${pp_size}" + +if [[ "${no_pp}" = "true" ]]; then +deepspeed_options="${deepspeed_options} \ + --no-pipeline-parallel" +fi + +if [ "${activation_checkpoint}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --deepspeed-activation-checkpointing" +fi + +## When saving checkpoint to a storage with cache, their could be consistency +## issue of the pointer to latest checkpoint. Here we find the correct pointer +## and broadcast it to all nodes. +iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt" +iteration_file_2="$checkpoint_path/latest" +iteration=0 +for (( node = 0; node <= num_node-1; node++ )) +do + if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then + local_iteration=$(ssh -q worker-"$node" cat $iteration_file) + iteration=$(( ${local_iteration} > ${iteration} ? 
${local_iteration} : ${iteration} )) + fi +done +if [[ $iteration -gt 0 ]]; then + iteration_2="global_step${iteration}" + ds_ssh "echo $iteration > $iteration_file" + ds_ssh "echo $iteration_2 > $iteration_file_2" +fi + +deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log \ No newline at end of file diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh new file mode 100644 index 000000000..46c6c48b5 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh @@ -0,0 +1,241 @@ +############################################################################### +### Each block below is one pretraining setup. Uncomment one block to try. +############################################################################### +### Baseline cases, mostly based on Megatron-LM's BERT-Large hyperparameters, +### but with some changes (different LR schedule). +## Baseline 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} +############################################################################### +### Curriculum learning (CL) + Random layerwise token dropping (random-LTD). +### Due to resource constraints, we did not finish training any model with this +### setup. This example is just to demonstrate that CL+random-LTD can run for +### BERT pretraining. +## CL+random-LTD 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="true" +# ltd_start=200 +# ltd_step_in_million=1.8 +# dropout=0 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=0.96 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} \ +############################################################################### +### Random layerwise token dropping (random-LTD). 
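+## The positional arguments passed to ds_pretrain_bert_336M_base_script.sh in
+## the cases below map to that script's parameters in order: lr,
+## train_iters_in_million, ltd_enabled, ltd_start, ltd_step_in_million,
+## dropout, followed by the cl_* arguments in the curriculum learning cases.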
+## random-LTD 723B tokens (69%): +# lr=1.45e-4 +# train_iters_in_million=1.38 +# ltd_enabled="true" +# ltd_start=200 +# ltd_step_in_million=1.8 +# dropout=0 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} +############################################################################### +### Curriculum learning (CL). +## CL vocab rarity 734B tokens (70%): +# lr=1.4e-4 +# train_iters_in_million=1.4 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.7 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL vocab rarity + seqlen truncation 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=0.96 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen reorder 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# 
cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlenvocabrarity" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen_vocab_rarity/seqlen_vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen_vocab_rarity/seqlen_vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL vocab rarity 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen truncation 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen_truncate" +# cl_1st_index_to_sample_path="dummy" +# cl_1st_index_to_metric_path="dummy" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=128 +# cl_1st_max=512 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=8 +# cl_1st_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen reorder 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen" +# 
cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen/seqlen_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen/seqlen_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=8 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### \ No newline at end of file diff --git a/examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh b/examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh new file mode 100644 index 000000000..3b1caf06f --- /dev/null +++ b/examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +num_workers=1 # Num nodes to run the map job +num_threads=40 # Num threads on each node. Set this based on #CPU cores + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=1 + +# Which node is this node (start with 0 and end with num_workers-1). This +# script only launch the map job on 1 worker node, since we don't expect +# running on many nodes and workers don't need any communication. But you +# can modify this script to add a MPI/torch distributed launcher. +worker_id=$1 +save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq + +seq_len=2048 +batch_size=10000 + +jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" +# Public the Pile dataset, can be downloaded at +# https://mystic.the-eye.eu/public/AI/pile_neox/ +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" +data_home="/blob/data/the_pile_public_merged_nopreprocessing" +data_path="${data_home}/pile_text_document" + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! -f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
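+
+# Example invocation (illustrative; adjust to your setup): with the default
+# num_workers=1 above, run "bash ds_analyze_gpt_data_map.sh 0" on one node.
+# With more workers, run this script once per node with that node's worker_id
+# (0 to num_workers-1), then run ds_analyze_gpt_data_reduce.sh to combine the
+# per-worker results.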
+options=" \ + --analyzing-task map \ + --analyzing-data-type GPT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-worker-id ${worker_id} \ + --analyzing-num-threads ${num_threads} \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh b/examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh new file mode 100644 index 000000000..a1242ea94 --- /dev/null +++ b/examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Set these 2 to the same as what you used during map job. We need these 2 +# configs to know how many map job result files do we have. +num_workers=1 +num_threads=40 +# Reduce job only has 1 worker but can accelerate by multithreading. +num_threads_reduce=40 + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=1 + +save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq + +seq_len=2048 +batch_size=10000 + +jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-reduce" +# Public the Pile dataset, can be downloaded at +# https://mystic.the-eye.eu/public/AI/pile_neox/ +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" +data_home="/blob/data/the_pile_public_merged_nopreprocessing" +data_path="${data_home}/pile_text_document" + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! -f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
+options=" \ + --analyzing-task reduce \ + --analyzing-data-type GPT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-num-threads ${num_threads} \ + --analyzing-num-threads-reduce ${num_threads_reduce} \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json b/examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json new file mode 100644 index 000000000..09b276c88 --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json @@ -0,0 +1,28 @@ +{ +"train_batch_size" : 2048, +"train_micro_batch_size_per_gpu": 16, +"steps_per_print": 10, + +"zero_optimization": { + "stage": 0, + "elastic_checkpoint": true +}, + +"gradient_clipping": 1.0, +"prescale_gradients": true, + +"fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 +}, + +"bf16": { + "enabled": false +}, + +"wall_clock_breakdown" : false +} \ No newline at end of file diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh b/examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh new file mode 100644 index 000000000..4c16e608c --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh @@ -0,0 +1,77 @@ +## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md +## and follow the steps of installation/data downloading. + +## Code below only works when you run each evalharness task on a single GPU. +## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh +checkpoint_path=$1 +config_path=$2 +result_path=$3 +rank=$4 +tasks=$5 +hostname=$6 +master_port=$(( 12345 + ${rank} )) +batch_size=$7 +num_fewshot=$8 + +mp_size=1 +pp_size=1 +no_pp="true" +ep_size=1 + +vocab_file="gpt2-vocab.json" +if [ ! -f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_file="gpt2-merges.txt" +if [ ! -f "$merge_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +export HF_DATASETS_OFFLINE=1 + +dir2=$(dirname "$checkpoint_path") +dirname=$(basename "$dir2")/$(basename "$checkpoint_path") +result_path="${result_path}/${dirname}" +mkdir -p $result_path +result_file="${result_path}/${tasks}_${num_fewshot}shot.json" + +# Dummy arguments to make megatron happy. No need to configure them. +# The reason we don't need to configure them and many other arguments is +# because the eval framework will read the arguments from checkpoint file. 
+megatron_required_args="\ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 +" + +command="../../../../tasks/eval_harness/evaluate.py \ + --load ${checkpoint_path} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --moe-expert-parallel-size ${ep_size} \ + --vocab-file ${vocab_file} \ + --merge-file ${merge_file} \ + --micro-batch-size ${batch_size} \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --disable-moe-token-dropping \ + --adaptive_seq_len \ + --eval_fp32 \ + --num_fewshot ${num_fewshot} \ + --task_list ${tasks} \ + --results_path ${result_file} \ + --deepspeed \ + --deepspeed_config ${config_path} \ + ${megatron_required_args} \ + " + +if [[ "${no_pp}" = "true" ]]; then +command="${command} \ + --no-pipeline-parallel" +fi + +launcher="deepspeed --include=$hostname:$rank --master_port=${master_port}" +$launcher $command &> "${result_path}/${tasks}_${num_fewshot}shot.log" \ No newline at end of file diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py b/examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py new file mode 100644 index 000000000..e0c0c332c --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py @@ -0,0 +1,358 @@ +import json +import os +import math +from math import log10, floor +import copy + +def mean(arr): + return sum(arr) / len(arr) + + +def pop_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr)) + + +def sample_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1)) + + +def mean_stderr(arr): + return sample_stddev(arr) / math.sqrt(len(arr)) + + +def median(arr): + return arr[len(arr) // 2] + +metric_dict = { + "hellaswag":"acc_norm", + "lambada":"acc", + "triviaqa":"acc", + "webqs":"acc", + "winogrande":"acc", + "piqa":"acc_norm", + "arc_challenge":"acc_norm", + "arc_easy":"acc_norm", + "openbookqa":"acc_norm", + "race":"acc", + "boolq":"acc", + "cb":"acc", + "copa":"acc", + "rte":"acc", + "wic":"acc", + "wsc":"acc", + "multirc":"acc", + "record":"f1", + "anli_r1":"acc", + "anli_r2":"acc", + "anli_r3":"acc", + "wikitext":"word_perplexity", + "logiqa":"acc_norm", + "mathqa":"acc_norm", + "mc_taco":"f1", + "mrpc":"acc", + "prost":"acc_norm", + "pubmedqa":"acc", + "qnli":"acc", + "qqp":"acc", + "sciq":"acc_norm", + "sst":"acc", + "wnli":"acc" +} + +official_dict = { + "hellaswag":["HellaSwag","acc"], + "lambada":["LAMBADA","acc"], + "triviaqa":["TriviaQA","acc"], + "webqs":["WebQs","acc"], + "winogrande":["Winogrande","acc"], + "piqa":["PIQA","acc"], + "arc_challenge":["ARC Challenge","acc"], + "arc_easy":["ARC Easy","acc"], + "openbookqa":["OpenBookQA","acc"], + "race":["RACE-h","acc"], + "boolq":["BoolQ","acc"], + "cb":["CB","acc"], + "copa":["Copa","acc"], + "rte":["RTE","acc"], + "wic":["WiC","acc"], + "wsc":["WSC","acc"], + "multirc":["MultiRC","acc"], + "record":["ReCoRD","f1"], + "anli_r1":["ANLI R1","acc"], + "anli_r2":["ANLI R2","acc"], + "anli_r3":["ANLI R3","acc"], + "wikitext":["WikiText-2","ppl"], + "logiqa":["LogiQA","acc"], + "mathqa":["MathQA","acc"], + "mc_taco":["MC-TACO","f1"], + "mrpc":["MRPC","acc"], + "prost":["PROST","acc"], + "pubmedqa":["PubMedQA","acc"], + "qnli":["QNLI","acc"], + "qqp":["QQP","acc"], + "sciq":["SciQ","acc"], + "sst":["SST-2","acc"], + "wnli":["WNLI","acc"] +} + +# When comparing with gpt3 paper, the most trustful tasks are the hellaswag to +# 
anli_r3, who have >= 1000 samples (less variation), and have <= 43% data +# contamination in the paper. +gpt3paper_zeroshoteval = { + "hellaswag":[33.7,43.6,51.0,54.7,62.8,67.4,70.9,78.9], + "lambada":[42.7,54.3,60.4,63.6,67.1,70.3,72.5,76.2], + "triviaqa":[4.15,7.61,14.0,19.7,31.3,38.7,41.8,64.3], + "webqs":[1.77,3.20,4.33,4.63,7.92,7.73,8.22,14.4], + "winogrande":[52.0,52.1,57.4,58.7,62.3,64.5,67.9,70.2], + "piqa":[64.6,70.2,72.9,75.1,75.6,78.0,78.5,81.0], + "arc_challenge":[26.6,29.5,31.8,35.5,38.0,41.4,43.7,51.4], + "arc_easy":[43.6,46.5,53.0,53.8,58.2,60.2,63.8,68.8], + "anli_r1":[33.4,34.2,33.4,33.4,34.2,32.3,33.2,34.6], + "anli_r2":[33.2,31.9,33.3,33.3,33.8,33.5,33.5,35.4], + "anli_r3":[33.6,34.0,33.8,33.4,35.3,34.8,34.4,34.5], + "openbookqa":[35.6,43.2,45.2,46.8,53.0,50.4,55.6,57.6], + "race":[35.2,37.9,40.1,40.9,42.4,44.1,44.6,45.5], + "boolq":[49.7,60.3,58.9,62.4,67.1,65.4,66.2,60.5], + "cb":[0.00,32.1,8.93,19.6,19.6,28.6,19.6,46.4], + "copa":[66.0,68.0,73.0,77.0,76.0,80.0,84.0,91.0], + "rte":[47.7,49.8,48.4,56.0,46.6,55.2,62.8,63.5], + "wic":[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00], + "wsc":[59.6,56.7,65.4,61.5,66.3,60.6,64.4,65.4], + "multirc":[4.72,9.65,12.3,13.6,14.3,18.4,24.2,27.6], + "record":[71.9,79.2,82.8,85.2,87.3,89.5,90.4,91.0] +} + +gpt3paper_fewshoteval = { + "hellaswag":[33.5,43.1,51.3,54.9,62.9,67.3,71.3,79.3], + "lambada":[22.0,40.4,63.2,57.0,78.1,79.1,81.3,86.4], + "triviaqa":[6.96,16.3,26.5,32.1,42.3,51.6,57.5,71.2], + "webqs":[5.46,12.6,15.9,19.6,24.8,27.7,33.5,41.5], + "winogrande":[51.3,52.6,57.5,59.1,62.6,67.4,70.0,77.7], + "piqa":[64.3,69.4,72.0,74.3,75.4,77.8,79.9,82.3], + "arc_challenge":[25.5,28.4,32.3,36.7,39.5,43.7,44.8,51.5], + "arc_easy":[42.7,51.0,58.1,59.1,62.1,65.8,69.1,70.1], + "anli_r1":[32.1,32.5,30.9,32.5,33.5,33.1,33.3,36.8], + "anli_r2":[35.7,33.8,32.1,31.4,32.6,33.3,32.6,34.0], + "anli_r3":[35.0,34.4,35.1,36.0,32.7,33.9,34.5,40.2], + "openbookqa":[37.0,43.6,48.0,50.6,55.6,55.2,60.8,65.4], + "race":[34.3,37.0,40.4,41.4,42.3,44.7,45.1,46.8], + "boolq":[43.1,60.6,62.0,64.1,70.3,70.0,70.2,77.5], + "cb":[42.9,58.9,53.6,69.6,67.9,60.7,66.1,82.1], + "copa":[67.0,64.0,72.0,77.0,83.0,83.0,86.0,92.0], + "rte":[52.3,48.4,46.9,50.9,56.3,49.5,60.6,72.9], + "wic":[49.8,55.0,53.0,53.0,51.6,53.1,51.1,55.3], + "wsc":[58.7,60.6,54.8,49.0,62.5,67.3,75.0,75.0], + "multirc":[6.09,11.8,16.8,20.8,24.7,23.8,25.0,32.5], + "record":[70.7,77.9,82.1,84.0,87.5,88.8,89.8,90.1] +} + +gpt3paper_zeroshoteval_index = { + "125M":0, # Small + "350M":1, # Medium + "760M":2, # Large + "1.3B":3, # XL + "2.7B":4, + "6.7B":5, + "13B":6, + "175B":7 +} + +def round_sig(x, sig=3): + if x == 0: + return 0 + return round(x, sig-int(floor(log10(abs(x))))-1) + +def generate_result_table(tab_header, configs, task_order, caption, avg_range, + avg_tag, avg_only=False, fontsize="\\footnotesize", find_best=False, + candidate_range=None, candidate_task=None, split_name_by_space=False, + print_stderr=False, few_shot=False): + # Gather results + result_list = [] + for i in range(len(configs)): + result_dict = {} + eval_path = configs[i][-1] + if "paper" in configs[i][0]: + assert eval_path is None + if eval_path is None: + assert "paper" in configs[i][0] + assert configs[i][1] in gpt3paper_zeroshoteval_index, "the second element has to be the model size" + paper_result_idx = gpt3paper_zeroshoteval_index[configs[i][1]] + if few_shot: + for task in gpt3paper_fewshoteval: + result_dict[task] = [gpt3paper_fewshoteval[task][paper_result_idx]] + else: + for task in gpt3paper_zeroshoteval: + 
result_dict[task] = [gpt3paper_zeroshoteval[task][paper_result_idx]] + else: + for file in os.listdir(eval_path): + if file.endswith(".json"): + result = json.load(open(eval_path+"/"+file, "r")) + for task in result['results']: + if task != "wikitext": + result_dict[task] = [100.0*result['results'][task][metric_dict[task]]] + else: + result_dict[task] = [result['results'][task][metric_dict[task]]] + result_list.append(result_dict) + avg_list = [] + for i in range(len(configs)): + average_results = [] + for j in range(len(avg_range)): + results = [] + for k in range(avg_range[j]+1): + if task_order[k] in result_list[i]: + results.append(result_list[i][task_order[k]][0]) + if len(results) > 0: + average_results.append(float(sum(results))/len(results)) + else: + average_results.append(0) + avg_list.append(average_results) + + if find_best: + best_avg_value = [0 for _ in range(len(avg_range))] + best_avg_idx = [0 for _ in range(len(avg_range))] + best_task_value = [0 for _ in range(len(candidate_task))] + best_task_idx = [0 for _ in range(len(candidate_task))] + for i in range(candidate_range, len(configs)): + for j in range(len(avg_range)): + if avg_list[i][j] > best_avg_value[j]: + best_avg_value[j] = avg_list[i][j] + best_avg_idx[j] = i + for j in range(len(candidate_task)): + if result_list[i][candidate_task[j]] > best_task_value[j]: + best_task_value[j] = result_list[i][candidate_task[j]] + best_task_idx[j] = i + # reorder configs, result_list, avg_list to only keep the best cases + new_configs = configs[:candidate_range] + new_result_list = result_list[:candidate_range] + new_avg_list = avg_list[:candidate_range] + for i in range(len(avg_range)): + selected_config = copy.deepcopy(configs[best_avg_idx[i]]) + selected_config[0] = "({})Best Avg{}".format(len(new_configs), + avg_tag[i]) + new_configs.append(selected_config) + new_result_list.append(result_list[best_avg_idx[i]]) + new_avg_list.append(avg_list[best_avg_idx[i]]) + + for i in range(len(candidate_task)): + selected_config = copy.deepcopy(configs[best_task_idx[i]]) + selected_config[0] = "({})Best {}".format(len(new_configs), + official_dict[candidate_task[i]][0]) + new_configs.append(selected_config) + new_result_list.append(result_list[best_task_idx[i]]) + new_avg_list.append(avg_list[best_task_idx[i]]) + configs = new_configs + result_list = new_result_list + avg_list = new_avg_list + + # split the case names by space + if split_name_by_space: + max_num_row = 1 + splitted_names = [] + for i in range(len(configs)): + new_name = configs[i][0].split() + max_num_row = max(max_num_row, len(new_name)) + splitted_names.append(new_name) + tab_header = ["" for _ in range(max_num_row-1)] + tab_header + for i in range(len(configs)): + padding = ["" for _ in range(max_num_row-len(splitted_names[i]))] + configs[i] = padding + splitted_names[i] + configs[i][1:] + + # generate the table + print("\\begin{table}") + print("\centering") + print(fontsize) + print("\caption{"+caption+"}") + text = "\\begin{tabular}{@{}l|" + for _ in range(len(configs)): + text += "c" + text += "@{}}" + print(text) + print("\\toprule") + for i in range(len(tab_header)): + text = "{} &".format(tab_header[i]) + for j in range(len(configs)): + if j != len(configs) - 1: + text += (configs[j][i] + "& ") + else: + text += (configs[j][i] + "\\\\") + print(text) + print("\midrule") + for i in range(len(avg_range)): + text = ("Avg. 
" + avg_tag[i]) + arr = [] + for j in range(len(configs)): + arr.append(avg_list[j][i]) + text += " & {}".format(round_sig(avg_list[j][i])) + text += "\\\\" + if print_stderr: + arr_mean = mean(arr) + arr_std = sample_stddev(arr) + text += " % mean {:.3f}, std {:.3f}, mean+1std {:.3f}, mean+2std {:.3f}, mean+3std {:.3f}".format( + arr_mean, arr_std, arr_mean+arr_std, arr_mean+arr_std*2, arr_mean+arr_std*3) + print(text) + if not avg_only: + print("\midrule") + for i in range(len(task_order)): + task = task_order[i] + text = "({}) {}".format(i, official_dict[task][0]) + arr = [] + for j in range(len(configs)): + result_dict = result_list[j] + if task in result_dict: + text += " & {}".format(round_sig(result_dict[task][0])) + arr.append(result_dict[task][0]) + else: + text += " & N/A" + text += "\\\\" + if print_stderr: + arr_mean = mean(arr) + arr_std = sample_stddev(arr) + if task != "wikitext": + text += " % mean {:.3f}, std {:.3f}, mean+1std {:.3f}, mean+2std {:.3f}, mean+3std {:.3f}".format( + arr_mean, arr_std, arr_mean+arr_std, arr_mean+arr_std*2, arr_mean+arr_std*3) + else: + text += " % mean {:.3f}, std {:.3f}, mean-1std {:.3f}, mean-2std {:.3f}, mean-3std {:.3f}".format( + arr_mean, arr_std, arr_mean-arr_std, arr_mean-arr_std*2, arr_mean-arr_std*3) + print(text) + print("\\bottomrule") + print("\end{tabular}") + print("\end{table}") + print("") + print("") + +if __name__ == '__main__': + task_order = ["hellaswag","lambada","triviaqa","webqs","winogrande","piqa", + "arc_challenge","arc_easy","anli_r1","anli_r2","anli_r3","openbookqa", + "race","boolq","copa","rte","wsc","multirc","record","wikitext"] + avg_range = [18] + avg_tag = ["0-18"] + tab_header = ["Case","Model size","Train tokens","Batch size","Bsz warmup","LR","min LR","LR warmup","LR decay","decay style"] + + configs = [ + ["(0)paper","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", None], # gpt3 paper orig results, thus result path is None + ["(1)repro","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/'], + ["(2)fixedBsz","125M","300B","256","N/A","6e-4","6e-5","3000M","260B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'], + ["(3)fixedBsz 300B+minLR","125M","300B","256","N/A","6e-4","1e-6","3000M","300B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min1.0e-6-wup3000M-dcy300B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'] + ] + caption = 'Conglong: GPT-3 125M results zero-shot' + generate_result_table(tab_header, configs, task_order, caption, avg_range, + avg_tag, split_name_by_space=True, fontsize="\\tiny") + + configs = [ + ["(0)paper","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", None], # gpt3 paper orig results, thus result path is None + ["(1)repro","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/'], + ["(2)fixedBsz","125M","300B","256","N/A","6e-4","6e-5","3000M","260B","cosine", + 
'/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'], + ["(3)fixedBsz 300B+minLR","125M","300B","256","N/A","6e-4","1e-6","3000M","300B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min1.0e-6-wup3000M-dcy300B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'], + ] + caption = 'Conglong: GPT-3 125M results few-shot' + generate_result_table(tab_header, configs, task_order, caption, avg_range, + avg_tag, split_name_by_space=True, fontsize="\\tiny", few_shot=True) + diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh new file mode 100644 index 000000000..b14622a32 --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh @@ -0,0 +1,66 @@ +## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md +## and follow the steps of installation/data downloading. +checkpoint_paths=( + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ +) + +## No need to use the exact training config json, just use this dummy is fine +config_path=ds_config_eval_dummy.json +username=$(whoami) +result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results" + +## Task(s) on the same row will be performed together in the same process. +## There exist other tasks that can run but we skip because they didn't appear +## or have strange scores in GPT-3 paper: qqp, prost, cb, wic, mrpc, sst, wnli +## pubmedqa, logiqa, qnli, sciq, mc_taco, mathqa. For wikitext, it didn't +## appear in paper but we include it for a perplexity task. +tasks=( + record + triviaqa + hellaswag + arc_challenge + arc_easy + race + multirc + openbookqa + lambada + webqs + winogrande + piqa + anli_r1,anli_r2,anli_r3 + boolq,copa + rte,wsc + wikitext +) + +## Use localhost if you didn't setup hostfile as described in +## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. +## If hostfile exist, use hostname (e.g., worker-0) in hostfile. +# hostname="localhost" +hostname="worker-0" + +batch_size=32 + +## This script is for zero-shot +num_fewshot=0 + +num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +cuda_id=-1 +total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) + +## Code below only works when you run each evalharness task on a single GPU. 
+## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh +for l in "${!checkpoint_paths[@]}"; do + checkpoint_path=${checkpoint_paths[l]} + for ((i=0;i<${#tasks[@]};++i)); do + task=${tasks[i]} + free_mem=0 + while [ $free_mem -lt $total_mem ]; do + cuda_id=$(((cuda_id+1)%num_gpus)) + free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) + sleep 60s + done + bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & + done +done diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh new file mode 100644 index 000000000..208de033f --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh @@ -0,0 +1,61 @@ +## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md +## and follow the steps of installation/data downloading. +checkpoint_paths=( + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ +) + +## No need to use the exact training config json, just use this dummy is fine +config_path=ds_config_eval_dummy.json +username=$(whoami) +result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results_10shot" + +## Task(s) on the same row will be performed together in the same process. +tasks=( + record + triviaqa + hellaswag + arc_challenge + arc_easy + race + multirc + openbookqa + lambada + webqs + winogrande + piqa + anli_r1,anli_r2 + anli_r3 + boolq,copa + rte,wsc +) + +num_fewshot=10 + +## Use localhost if you didn't setup hostfile as described in +## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. +## If hostfile exist, use hostname (e.g., worker-0) in hostfile. +# hostname="localhost" +hostname="worker-0" + +batch_size=16 + +num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +cuda_id=-1 +total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) + +## Code below only works when you run each evalharness task on a single GPU. 
+## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh +for l in "${!checkpoint_paths[@]}"; do + checkpoint_path=${checkpoint_paths[l]} + for ((i=0;i<${#tasks[@]};++i)); do + task=${tasks[i]} + free_mem=0 + while [ $free_mem -lt $total_mem ]; do + cuda_id=$(((cuda_id+1)%num_gpus)) + free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) + sleep 60s + done + bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & + done +done diff --git a/examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json new file mode 100644 index 000000000..a9e3d6116 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json @@ -0,0 +1,74 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "decoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json new file mode 100644 index 000000000..3209f34b0 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json @@ -0,0 +1,88 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": 
DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "decoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + }, + "CL_2nd_METRIC_NAME": { + "index_to_sample_path": "CL_2nd_SAMPLE_PATH", + "index_to_metric_path": "CL_2nd_METRIC_PATH", + "difficulty_type": "CL_2nd_DIFF_TYPE", + "clustering_type": "CL_2nd_CLUSTER_TYPE", + "min_difficulty": CL_2nd_MIN, + "max_difficulty": CL_2nd_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_2nd_TOTAL_STEP, + "difficulty_step": CL_2nd_DIFF_STEP, + "root_degree": CL_2nd_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh new file mode 100644 index 000000000..334989487 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh @@ -0,0 +1,515 @@ +#!/bin/bash +dir=`pwd` +############################################################################### +### Main configs +## GPT-3 models use 2K sequence length/context window +seq_len=2048 + +## The "GPT-3 XXX" below are configs from GPT-3 paper +## https://arxiv.org/abs/2005.14165, choose based on +## your desired model size or build your own configs + +## init_std is standard deviation for weight initialization. Usually larger +## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size) +## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf) + +## We changed min_lr to a lower number (1.0e-6), which we found is able to +## provide better zero-shot eval results. 
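+## As a worked example of the init_std heuristic above: hidden_size=768 gives
+## sqrt(1/3/768) ~= 0.021 (rounded to the 0.02 used below), and hidden_size=2048
+## gives sqrt(1/3/2048) ~= 0.013, the init_std used for the 1.3B model below.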
+ +## GPT-3 Small 125M +# model_size=0.125 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=256 +# lr=6.0e-4 +# min_lr=1.0e-6 +# init_std=0.02 + +## GPT-3 Medium 350M +# model_size=0.35 +# num_layers=24 +# hidden_size=1024 +# num_attn_heads=16 +# global_batch_size=256 +# lr=3.0e-4 +# min_lr=1.0e-6 +# init_std=0.018 + +## GPT-3 Large 760M +# model_size=0.76 +# num_layers=24 +# hidden_size=1536 +# num_attn_heads=16 +# global_batch_size=256 +# lr=2.5e-4 +# min_lr=1.0e-6 +# init_std=0.015 + +## GPT-3 XL 1.3B +model_size=1.3 +num_layers=24 +hidden_size=2048 +num_attn_heads=16 +global_batch_size=512 +# lr=2.0e-4 +lr=$1 +min_lr=1.0e-6 +init_std=0.013 + +## GPT-3 2.7B +# model_size=2.7 +# num_layers=32 +# hidden_size=2560 +# num_attn_heads=32 +# global_batch_size=512 +# lr=1.6e-4 +# min_lr=1.0e-6 +# init_std=0.011 + +## GPT-3 6.7B +# model_size=6.7 +# num_layers=32 +# hidden_size=4096 +# num_attn_heads=32 +# global_batch_size=1024 +# lr=1.2e-4 +# min_lr=1.0e-6 +# init_std=0.009 + +## GPT-3 13B +# model_size=13 +# num_layers=40 +# hidden_size=5120 +# num_attn_heads=40 +# global_batch_size=1024 +# lr=1.0e-4 +# min_lr=1.0e-6 +# init_std=0.008 + +## GPT-3 175B +# model_size=175 +# num_layers=96 +# hidden_size=12288 +# num_attn_heads=96 +# global_batch_size=1536 +# lr=0.6e-4 +# min_lr=1.0e-6 +# init_std=0.005 +############################################################################### +### Training duration configs +## The main termination condition, original GPT-3 paper trains for 300B tokens. +# train_tokens_in_billion=300 +train_tokens_in_billion=$2 +train_tokens=$((${train_tokens_in_billion} * 1000000000)) + +## train_samples is another termination condition and also affect the number of +## data samples to be indexed. Since we want to reach the train_tokens +## above, and data efficiency techniques may change num tokens in some samples, +## so we just set this config large enough to make sure we have enough +## processed data and don't terminate by train_samples. +train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} )) + +## Another wall-clock time termination condition in minutes. Set it large +## enough to avoid undesired early termination. +exit_duration=30000000 +############################################################################### +### lr configs +## lr warmup and decay duration. +## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens. +## Here we increase the warmup tokens to 3B since when batch size warmup is not +## used, there are more tokens per step. Thus we need to increase warmup tokens +## to make sure there are enough warmup steps, which is important for training +## stability. +lr_warmup_tokens_in_million=3000 +lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000)) +## Here we changed the LR decay tokens to align with total train tokens, since +## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the +## learning rate schedule to match the number of training tokens results in the +## best final model quality +lr_decay_tokens_in_billion=${train_tokens_in_billion} +lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000)) +lr_decay_style="cosine" +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Note that currently both curriculum learning and random-LTD are NOT +## compatible with pipeline parallelism. 
+pp_size=1 +no_pp="true" + +## ZeRO-based data parallelism, stage=0 will disable ZeRO +zero_stage=1 + +## Total number of GPUs. ds_ssh is from DeepSpeed library. +num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2)) +num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +num_node=$(( ${num_gpus} / ${num_gpus_pernode} )) + +## Data parallel size. +dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} )) + +## Micro batch size per GPU +## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus +## Reduce it manually if GPU OOM +batch_size=$(( ${global_batch_size} / ${dp_size} )) +############################################################################### +### Random layerwise token dropping (random-LTD) configs +## random-LTD's main switch. "false" means disabled. "true" means enabled. +ltd_enabled=${3:-'false'} +## How much dropping ratio to start with. The value denotes the seqlen after +## dropping. +ltd_start=${4:-2048} +## How many steps for random-LTD to gradually reduce dropping ratio to zero. +ltd_step=${5:-1} + +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=200000 +############################################################################### +### Curriculum learning (CL) configs +## CL's main switch. "false" means disabled. "true" means enabled. +cl_enabled=${6:-'false'} +## Number of CL metrics to use. +cl_num_metric=${7:-1} + +## Name of difficulty metric +cl_1st_metric=${8:-'dummy'} +## Path to the data indexes for this difficulty metric. Samples on ith row of +## index_to_sample have the difficulty value equals to ith row of +## index_to_metric. +cl_1st_index_to_sample_path=${9:-'dummy'} +cl_1st_index_to_metric_path=${10:-'dummy'} +## During training, whether increase difficulty by value- or percentile-based. +cl_1st_difficulty_type=${11:-'value'} +## "single_cluster" means no clustering required and probably CL is achieved by +## data postprocessing. "schedule_based" means will cluster data based on the +## difficulty schedule (pacing function) below. +cl_1st_clustering_type=${12:-'single_cluster'} +## Start difficulty +cl_1st_min=${13:-2048} +## End difficulty +cl_1st_max=${14:-2048} +## Total step to reach end difficulty +cl_1st_total_step=${15:-1} +## When changing difficulty, always make sure it's a multiple of the +## difficulty_step below. +cl_1st_difficulty_step=${16:-1} +## Root degree of the schedule (pacing function). +cl_1st_root=${17:-1} + +cl_2nd_metric=${18:-'dummy'} +cl_2nd_index_to_sample_path=${19:-'dummy'} +cl_2nd_index_to_metric_path=${20:-'dummy'} +cl_2nd_difficulty_type=${21:-'value'} +cl_2nd_clustering_type=${22:-'single_cluster'} +cl_2nd_min=${23:-2048} +cl_2nd_max=${24:-2048} +cl_2nd_total_step=${25:-1} +cl_2nd_difficulty_step=${26:-1} +cl_2nd_root=${27:-1} + +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# ## The *_index_to_sample_percentile_merged is a concatenated index for perf +# ## improvement, but it only works when you set difficulty_type="percentile" in +# ## ds_config. 
If you use difficulty_type="value", you need to change this to +# ## *_index_to_sample +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 + +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +############################################################################### +### Misc configs +log_interval=100 +eval_iters=10 +eval_interval=100 +# num_save controls how frequent to save checkpoint. num_save=20 means that a +# checkpoint will be saved every 5% of training. For longer training you would +# want larger num_save to save more frequently, and vice versa. +num_save=100 +estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size})) +save_interval=$((${estimated_train_iter} / ${num_save})) + +## Activation checkpointing saves GPU memory, but reduces training speed +activation_checkpoint="true" +# activation_checkpoint="false" + +## Whether or not log optimizer states (norms, max abs values) to tensorboard. +## This is not required for training and might save GPU memory when turned off. +log_optimizer_state="true" +############################################################################### +### Output and data configs +current_time=$(date "+%Y.%m.%d_%H.%M.%S") +host="${HOSTNAME}" +seed=1234 +num_workers=0 + +## Public the Pile dataset, can be downloaded at +## https://mystic.the-eye.eu/public/AI/pile_neox/ Change data_home to where you +## store the pile_text_document.bin and pile_text_document.idx. +data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" +if [[ "$host" == *"webxt"* ]]; then + data_home="/blob/data/the_pile_public_merged_nopreprocessing" +fi +data_path="${data_home}/pile_text_document" +## *_idx_path force Megatron to use a specific data index file generated when +## we analyze data. This is needed because our index for curriculum learning +## difficulty metric is based on this data index. +doc_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_doc_idx.npy" +sample_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_sample_idx.npy" +shuffle_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_shuffle_idx.npy" + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! 
-f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +prescale_grad="true" +jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B" +jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}" +jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}" +if [[ $zero_stage -gt 0 ]]; then + jobname="${jobname}_z${zero_stage}" + prescale_grad="false" +fi +if [[ $mp_size -gt 1 ]]; then + jobname="${jobname}_mp${mp_size}" +fi +if [ "${no_pp}" = "false" ]; then + jobname="${jobname}_pp${pp_size}" +fi +jobname="${jobname}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + jobname="${jobname}_ltd_${ltd_start}_${ltd_step}" +fi +if [ "${cl_enabled}" = "true" ]; then + jobname="${jobname}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + jobname="${jobname}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}" + fi +fi + +username=$(whoami) +output_home="/blob/users/${username}/project/data_efficient_gpt" +log_path="${output_home}/log/" +checkpoint_path="${output_home}/checkpoint/${jobname}" +## Microsoft internal constraint: because tensorboard is logged by last rank, +## it's better to put the path in NFS instead of Blob. +tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/" +tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}" +mkdir -p ${log_path} +mkdir -p ${checkpoint_path} +mkdir -p ${tensorboard_path} +if [ "${cl_enabled}" = "true" ]; then + data_cluster_path="${output_home}/data_cluster/${jobname}" + mkdir -p ${data_cluster_path} +fi +############################################################################### +data_options=" \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap" + +## If CL is used, make sure to set "--split" the same as what you used during +## offline data analysis&indexing. 
+megatron_options=" \ + --override-lr-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size ${mp_size} \ + --init-method-std ${init_std} \ + --lr-decay-tokens ${lr_decay_tokens} \ + --lr-warmup-tokens ${lr_warmup_tokens} \ + --micro-batch-size ${batch_size} \ + --exit-duration-in-mins ${exit_duration} \ + --global-batch-size ${global_batch_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --train-tokens ${train_tokens} \ + --train-samples ${train_samples} \ + --lr ${lr} \ + --min-lr ${min_lr} \ + --lr-decay-style ${lr_decay_style} \ + --split 949,50,1 \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --save-interval ${save_interval} \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --hysteresis 2 \ + --num-workers ${num_workers} \ + --fp16 \ + --seed ${seed} \ + --load ${checkpoint_path} \ + --save ${checkpoint_path} \ + --tensorboard-queue-size 1 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --tensorboard-dir ${tensorboard_path}" + +if [ "${activation_checkpoint}" = "true" ]; then +megatron_options="${megatron_options} \ + --checkpoint-activations" +fi + +if [ "${log_optimizer_state}" = "true" ]; then +megatron_options="${megatron_options} \ + --log-optimizer-states-to-tensorboard" +fi + +if [ "${ltd_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --random-ltd" +fi + +if [ "${cl_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --train-doc-idx-path ${doc_idx_path} \ + --train-sample-idx-path ${sample_idx_path} \ + --train-shuffle-idx-path ${shuffle_idx_path} \ + --data-efficiency-curriculum-learning" +fi + +config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + config_json="${config_json}_ltd_${ltd_start}_${ltd_step}" +fi +if [ "${cl_enabled}" = "true" ]; then + config_json="${config_json}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + config_json="${config_json}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}" + fi +fi +config_json="${config_json}.json" +if [[ $cl_num_metric -gt 1 ]]; then +template_json="ds_config_gpt_2clmetrics_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed 
"s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed "s/CL_1st_ROOT/${cl_1st_root}/" \ + | sed "s#CL_2nd_METRIC_NAME#${cl_2nd_metric}#" \ + | sed "s#CL_2nd_SAMPLE_PATH#${cl_2nd_index_to_sample_path}#" \ + | sed "s#CL_2nd_METRIC_PATH#${cl_2nd_index_to_metric_path}#" \ + | sed "s#CL_2nd_DIFF_TYPE#${cl_2nd_difficulty_type}#" \ + | sed "s#CL_2nd_CLUSTER_TYPE#${cl_2nd_clustering_type}#" \ + | sed "s/CL_2nd_MIN/${cl_2nd_min}/" \ + | sed "s/CL_2nd_MAX/${cl_2nd_max}/" \ + | sed "s/CL_2nd_TOTAL_STEP/${cl_2nd_total_step}/" \ + | sed "s/CL_2nd_DIFF_STEP/${cl_2nd_difficulty_step}/" \ + | sed "s/CL_2nd_ROOT/${cl_2nd_root}/" \ + > ${config_json} +else +template_json="ds_config_gpt_1clmetric_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed "s/CL_1st_ROOT/${cl_1st_root}/" \ + > ${config_json} +fi + +deepspeed_options=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --pipeline-model-parallel-size ${pp_size}" + +if [[ "${no_pp}" = "true" ]]; then +deepspeed_options="${deepspeed_options} \ + --no-pipeline-parallel" +fi + +if [ "${activation_checkpoint}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --deepspeed-activation-checkpointing" +fi + +## When saving checkpoint to a storage with cache, their could be consistency +## issue of the pointer to latest checkpoint. Here we find the correct pointer +## and broadcast it to all nodes. +iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt" +iteration_file_2="$checkpoint_path/latest" +iteration=0 +for (( node = 0; node <= num_node-1; node++ )) +do + if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then + local_iteration=$(ssh -q worker-"$node" cat $iteration_file) + iteration=$(( ${local_iteration} > ${iteration} ? 
${local_iteration} : ${iteration} )) + fi +done +if [[ $iteration -gt 0 ]]; then + iteration_2="global_step${iteration}" + ds_ssh "echo $iteration > $iteration_file" + ds_ssh "echo $iteration_2 > $iteration_file_2" +fi + +deepspeed ${dir}/../../../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log \ No newline at end of file diff --git a/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh new file mode 100644 index 000000000..8878c1792 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh @@ -0,0 +1,366 @@ +############################################################################### +### Each block below is one pretraining setup. Uncomment one block to try. +############################################################################### +### Baseline cases, mostly based on OpenAI's GPT-3 hyperparameters, but with +### some changes (without batch size warmup, and different LR schedule). +## Baseline 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} +############################################################################### +## Baseline 200B tokens (67%): +# lr=3.0e-4 # scaled based on train token reduction ratio +# train_tokens_in_billion=200 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} +############################################################################### +## Baseline 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} +############################################################################### +### Curriculum learning (CL) + Random layerwise token dropping (random-LTD). +### DeepSpeed Data Efficiency's best composed solution. 
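+## The blocks below pass positional arguments to
+## ds_pretrain_gpt_1.3B_dense_base_script.sh in the order defined at the top of
+## that script: lr ($1), train_tokens_in_billion ($2), the random-LTD settings
+## ($3-$5), then the curriculum learning settings ($6-$27).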
+## CL+random-LTD 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=200000 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL+random-LTD 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=100000 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=55000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=55000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +### Random layerwise token dropping (random-LTD). 
+## random-LTD 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=200000 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} +############################################################################### +## random-LTD 200B tokens (67%): +# lr=3.0e-4 +# train_tokens_in_billion=200 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=133333 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} +############################################################################### +## random-LTD 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=100000 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} +############################################################################### +### Curriculum learning (CL). +## CL vocab rarity + seqlen truncation 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen truncation 200B tokens (67%): +# lr=3.0e-4 +# train_tokens_in_billion=200 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=73000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# 
cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=73000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen truncation 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=55000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=55000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen reshape 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_reshape" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# 
cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen truncation 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen_truncate" +# cl_1st_index_to_sample_path="dummy" +# cl_1st_index_to_metric_path="dummy" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=80 +# cl_1st_max=2048 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=8 +# cl_1st_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen reshape 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen_reshape" +# cl_1st_index_to_sample_path="dummy" +# cl_1st_index_to_metric_path="dummy" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=80 +# cl_1st_max=2048 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=8 +# cl_1st_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} 
${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### \ No newline at end of file diff --git a/megatron/arguments.py b/megatron/arguments.py index 1d82427a3..4c4e00dcd 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -248,7 +248,7 @@ def parse_args(extra_args_provider=None, defaults={}, 'for distribute-checkpointed-activations to work you '\ 'need to enable checkpoint-activations' - args.curriculum_learning = False + args.curriculum_learning_legacy = False args.compression_training = False # AML @@ -444,6 +444,9 @@ def _add_training_args(parser): group.add_argument('--train-tokens', type=int, default=None, help='Total number of tokens to train over all ' 'training runs.') + group.add_argument('--random-ltd', + action='store_true', + help='enable random layer token drop') group.add_argument('--log-interval', type=int, default=100, help='Report loss and timing interval.') group.add_argument('--exit-interval', type=int, default=None, @@ -748,7 +751,21 @@ def _add_data_args(parser): 'end-of-document token.') group.add_argument('--eod-mask-loss', action='store_true', help='Mask loss for the end of document tokens.') - + group.add_argument('--train-data-exact-num-epochs', type=int, default=None, + help='When building the train dataset, force it to be ' + 'an exact number of epochs of the raw data') + group.add_argument('--return-data-index', action='store_true', + help='Return the index of data sample.') + group.add_argument('--data-efficiency-curriculum-learning', action='store_true', + help='Use DeepSpeed data efficiency library curriculum learning feature.') + group.add_argument('--train-idx-path', type=str, default=None, + help='Force to use certain index file.') + group.add_argument('--train-doc-idx-path', type=str, default=None, + help='Force to use certain index file.') + group.add_argument('--train-sample-idx-path', type=str, default=None, + help='Force to use certain index file.') + group.add_argument('--train-shuffle-idx-path', type=str, default=None, + help='Force to use certain index file.') return parser diff --git a/megatron/data/bert_dataset.py b/megatron/data/bert_dataset.py index 916a3be06..24965f7ca 100644 --- a/megatron/data/bert_dataset.py +++ b/megatron/data/bert_dataset.py @@ -73,13 +73,14 @@ def __len__(self): return self.samples_mapping.shape[0] def __getitem__(self, idx): + args = get_args() start_idx, end_idx, seq_length = self.samples_mapping[idx] sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
# We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) - return build_training_sample(sample, seq_length, + train_sample = build_training_sample(sample, seq_length, self.max_seq_length, # needed for padding self.vocab_id_list, self.vocab_id_to_token_dict, @@ -87,6 +88,9 @@ def __getitem__(self, idx): self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.binary_head) + if args.return_data_index: + train_sample['index'] = np.array([idx], dtype=np.int64) + return train_sample diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3052e9fdd..cf8ccb9fd 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -224,7 +224,7 @@ def create_masked_lm_predictions(tokens, if masked_lm_prob == 0: return (output_tokens, masked_lm_positions, - masked_lm_labels, token_boundary) + masked_lm_labels, token_boundary, None) num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) @@ -640,27 +640,39 @@ def get_samples_mapping(indexed_dataset, name, binary_head): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" - - if not num_epochs: - if not max_num_samples: - raise ValueError("Need to specify either max_num_samples " - "or num_epochs") - num_epochs = np.iinfo(np.int32).max - 1 - if not max_num_samples: + args = get_args() + if args.train_data_exact_num_epochs is not None and name == 'train': + num_epochs = args.train_data_exact_num_epochs max_num_samples = np.iinfo(np.int64).max - 1 + else: + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 # Filename of the index mapping indexmap_filename = data_prefix indexmap_filename += '_{}_indexmap'.format(name) - if num_epochs != (np.iinfo(np.int32).max - 1): - indexmap_filename += '_{}ep'.format(num_epochs) - if max_num_samples != (np.iinfo(np.int64).max - 1): - indexmap_filename += '_{}mns'.format(max_num_samples) + if args.train_data_exact_num_epochs is not None and name == 'train': + indexmap_filename += '_exact{}ep'.format(num_epochs) + else: + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) indexmap_filename += '_{}msl'.format(max_seq_length) indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) indexmap_filename += '_{}s'.format(seed) indexmap_filename += '.npy' + if name == 'train': + # force to use certain index files + if args.train_idx_path is not None: + indexmap_filename = args.train_idx_path + # Build the indexed mapping if not exist. 
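# Illustrative sketch only (made-up prefix, seed and sequence values): how the
# index-map cache filename is composed under the new options added here.
import numpy as np

def sketch_indexmap_filename(data_prefix, name, num_epochs, max_num_samples,
                             max_seq_length, short_seq_prob, seed,
                             exact_num_epochs=None, forced_idx_path=None):
    filename = '{}_{}_indexmap'.format(data_prefix, name)
    if exact_num_epochs is not None and name == 'train':
        filename += '_exact{}ep'.format(exact_num_epochs)
    else:
        if num_epochs != (np.iinfo(np.int32).max - 1):
            filename += '_{}ep'.format(num_epochs)
        if max_num_samples != (np.iinfo(np.int64).max - 1):
            filename += '_{}mns'.format(max_num_samples)
    filename += '_{}msl_{:0.2f}ssp_{}s.npy'.format(
        max_seq_length, short_seq_prob, seed)
    # --train-idx-path (if given) overrides the derived name for the train split.
    if name == 'train' and forced_idx_path is not None:
        filename = forced_idx_path
    return filename

# e.g. sketch_indexmap_filename('pile_bert', 'train', 3, 10**8, 512, 0.1, 1234,
#                               exact_num_epochs=1)
# -> 'pile_bert_train_indexmap_exact1ep_512msl_0.10ssp_1234s.npy'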
if torch.distributed.get_rank() == 0 and \ not os.path.isfile(indexmap_filename): @@ -699,12 +711,13 @@ def get_samples_mapping(indexed_dataset, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + if torch.cuda.device_count() > 0: # Skip when CPU-only + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0c9058454..5625f4586 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,7 +21,7 @@ import numpy as np import torch -from megatron import mpu, is_rank_0, print_rank_0 +from megatron import mpu, is_rank_0, print_rank_0, get_args from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_ @@ -159,6 +159,8 @@ def __len__(self): return self.sample_idx.shape[0] - 1 def __getitem__(self, idx): + args = get_args() + orig_idx = idx # Get the shuffled index. idx = self.shuffle_idx[idx] # Start and end documents and offsets. @@ -183,7 +185,8 @@ def __getitem__(self, idx): self.doc_idx[doc_index_l], length=offset_l + 1)) sample = np.concatenate(sample_list) - + if args.return_data_index: + return {'text': np.array(sample, dtype=np.int64), 'index': np.array([orig_idx], dtype=np.int64)} return {'text': np.array(sample, dtype=np.int64)} @@ -195,6 +198,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, training sample. shuffle-idx: maps the sample index into a random index into sample-idx. """ + args = get_args() # Number of tokens in each epoch and number of required epochs. tokens_per_epoch = _num_tokens(documents, sizes) num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) @@ -204,13 +208,26 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # Filename of the index mappings. 
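# Illustrative sketch only (made-up values): with the new --return-data-index
# flag each GPT sample carries its original (pre-shuffle) index next to the
# token ids, so a sample can be traced back to the data-analysis index files.
import numpy as np
sample_tokens = np.arange(2049, dtype=np.int64)  # stand-in for one training sample
orig_idx = 7                                     # index before shuffling
with_index = {'text': sample_tokens,
              'index': np.array([orig_idx], dtype=np.int64)}
without_index = {'text': sample_tokens}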
_filename = data_prefix _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) + if args.train_data_exact_num_epochs is not None and name == 'train': + num_epochs = args.train_data_exact_num_epochs + _filename += '_exact{}ep'.format(num_epochs) + else: + _filename += '_{}ns'.format(num_samples) _filename += '_{}sl'.format(seq_length) _filename += '_{}s'.format(seed) doc_idx_filename = _filename + '_doc_idx.npy' sample_idx_filename = _filename + '_sample_idx.npy' shuffle_idx_filename = _filename + '_shuffle_idx.npy' + if name == 'train': + # force to use certain index files + if args.train_doc_idx_path is not None: + doc_idx_filename = args.train_doc_idx_path + if args.train_sample_idx_path is not None: + sample_idx_filename = args.train_sample_idx_path + if args.train_shuffle_idx_path is not None: + shuffle_idx_filename = args.train_shuffle_idx_path + # Build the indexed mapping if not exist. if is_rank_0(): if (not os.path.isfile(doc_idx_filename)) or \ diff --git a/megatron/initialize.py b/megatron/initialize.py index 014bfa073..5ed7dc089 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -134,7 +134,8 @@ def _compile_dependencies(): if _is_rank_0(): start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) - fused_kernels.load(args) + if torch.cuda.device_count() > 0: # Skip when CPU-only + fused_kernels.load(args) torch.distributed.barrier() else: torch.distributed.barrier() @@ -205,7 +206,7 @@ def _initialize_distributed(): else: args.local_rank = device - torch.cuda.set_device(device) + torch.cuda.set_device(device) # only do so when device_count > 0 # Call the init process init_method = 'tcp://' @@ -247,7 +248,11 @@ def _set_random_seed(seed_): """Set random seed for reproducability.""" if seed_ is not None and seed_ > 0: # Ensure that different pipeline MP stages get different seeds. - seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) + # No need to do so for CPU-only case. 
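# Illustrative sketch only (made-up stage count): on GPU runs each pipeline
# stage derives its own seed from the base seed, while a CPU-only run keeps
# the base seed unchanged.
base_seed = 1234
num_pipeline_stages = 4
gpu_stage_seeds = [base_seed + 100 * stage for stage in range(num_pipeline_stages)]
# -> [1234, 1334, 1434, 1534]; CPU-only: every rank just uses 1234.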
+ if torch.cuda.device_count() == 0: + seed = seed_ + else: + seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index f31d1a12c..62b951064 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -114,7 +114,7 @@ def forward(self, input_ids, position_ids, attention_mask, labels=None, # attention_mask has size [1, 1, seqlen, seqlen] attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous() else: - if args.curriculum_learning: + if args.curriculum_learning_legacy: # If got a None input, need to reset curriculum_seqlen on user side args.curriculum_seqlen = args.seq_length diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 2ca337c8e..2c6802a74 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -427,7 +427,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False): """For easy load.""" - + args = get_args() state_dict_ = {} moe_state_dict = {} if self.pre_process: @@ -436,6 +436,15 @@ def state_dict_for_save_checkpoint(self, destination=None, prefix='', destination, prefix, keep_vars) encoder_state_dict = self.encoder.state_dict_for_save_checkpoint( destination, prefix, keep_vars) + if args.random_ltd: + # When using random-LTD, it is required to call remove_random_ltd_state_dict + # during model checkpoint saving to transfer the random-LTD-wrapped + # layers back to original layers. This will help to remove the dependency + # to random-LTD inside the checkpoint, so that during evaluation or + # finetuning of the checkpoint there is no need to depend on random-LTD + # again. 
+ from deepspeed.runtime.data_pipeline.data_routing.helper import remove_random_ltd_state_dict + encoder_state_dict = remove_random_ltd_state_dict(encoder_state_dict) # MoE states need to be handled separately by DeepSpeed engine, thus # moving them to the top level dictionary # If components other than encoder may contain MoE states, need to add diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 7b1fd0e78..24b46d5c7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -464,7 +464,7 @@ def __init__(self, init_method, output_layer_init_method, drop_tokens=args.moe_token_dropping, use_tutel=args.use_tutel, enable_expert_tensor_parallelism=enable_expert_tensor_parallelism) - def forward(self, hidden_states, attention_mask, + def forward(self, hidden_states, attention_mask=None, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False): # hidden_states: [b, s, h] @@ -706,7 +706,7 @@ def custom_forward(*inputs): moe_losses = [] for index in range(start, end): layer = self._get_layer(index) - x_, moe_loss = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask) + x_, moe_loss = layer(x_, attention_mask=attention_mask, encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask) moe_losses.append(moe_loss) return (x_, *moe_losses) return custom_forward @@ -779,7 +779,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, if layer_past is not None: past = layer_past[index] hidden_states = layer(hidden_states, - attention_mask, + attention_mask=attention_mask, encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask, layer_past=past, diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 492f1f10b..7f7c37fb3 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -41,7 +41,7 @@ def init_(tensor): def attention_mask_func(attention_scores, attention_mask): args = get_args() - if args.curriculum_learning: + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: attention_mask_ = attention_mask actual_seqlen = attention_scores.size()[2] if actual_seqlen != attention_mask_.size()[2]: diff --git a/megatron/training.py b/megatron/training.py index f02df5ba2..92d4ba2a8 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,8 @@ from deepspeed.compression.compress import init_compression, redundancy_clean +from megatron.model.transformer import ParallelTransformerLayer +from deepspeed.runtime.data_pipeline.data_routing.helper import convert_to_random_ltd def print_datetime(string): """Note that this call will sync across all ranks.""" @@ -69,7 +71,8 @@ def pretrain(train_valid_test_dataset_provider, model_provider, forward_step_func, extra_args_provider=None, - args_defaults={}): + args_defaults={}, + data_post_process=None): """Main training program. 
This function will run the followings in the order provided: @@ -118,9 +121,9 @@ def pretrain(train_valid_test_dataset_provider, open(args.deepspeed_config, 'r', encoding='utf-8')) if "curriculum_learning" in args.deepspeed_configuration and \ "enabled" in args.deepspeed_configuration["curriculum_learning"]: - args.curriculum_learning = args.deepspeed_configuration[ \ + args.curriculum_learning_legacy = args.deepspeed_configuration[ \ "curriculum_learning"]["enabled"] - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: from deepspeed.runtime.data_pipeline.curriculum_scheduler \ import CurriculumScheduler args.curriculum_scheduler = CurriculumScheduler( \ @@ -130,7 +133,9 @@ def pretrain(train_valid_test_dataset_provider, # Model, optimizer, and learning rate. timers('model-and-optimizer-setup').start() - model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider, teacher=False) + model, optimizer, lr_scheduler = setup_model_and_optimizer( + model_provider, teacher=False, data_post_process=data_post_process, + build_train_valid_test_datasets_provider=train_valid_test_dataset_provider) timers('model-and-optimizer-setup').stop() print_datetime('after model, optimizer, and learning rate ' 'scheduler are built') @@ -149,6 +154,18 @@ def pretrain(train_valid_test_dataset_provider, train_data_iterator, valid_data_iterator, test_data_iterator \ = build_train_valid_test_data_iterators( train_valid_test_dataset_provider) + if args.data_efficiency_curriculum_learning: + if args.deepspeed_dataloader is not None: + # We use args to pass the deepspeed_dataloader because adding + # output to setup_model_and_optimizer will break the API for other + # cases. We clear args.deepspeed_dataloader after updating + # train_data_iterator because args will be saved in checkpoint and + # attempting to save the whole deepspeed_dataloader will lead to + # "AttributeError: Can't pickle local object...". + train_data_iterator = iter(args.deepspeed_dataloader) + args.deepspeed_dataloader = None + else: + train_data_iterator = None timers('train/valid/test-data-iterators-setup').stop() print_datetime('after dataloaders are built') @@ -411,7 +428,8 @@ def load_model_weights_only(model_provider_func): return model, optimizer, lr_scheduler -def setup_model_and_optimizer(model_provider_func, teacher=False): +def setup_model_and_optimizer(model_provider_func, teacher=False, + data_post_process=None, build_train_valid_test_datasets_provider=None): """Setup model and optimizer.""" args = get_args() @@ -461,13 +479,47 @@ def setup_model_and_optimizer(model_provider_func, teacher=False): if args.deepspeed: print_rank_0("DeepSpeed is enabled.") pp = mpu.get_pipeline_model_parallel_world_size() - model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model[0], - optimizer=optimizer, - args=args, - lr_scheduler=lr_scheduler, - mpu=mpu if args.no_pipeline_parallel else None - ) + if args.data_efficiency_curriculum_learning and build_train_valid_test_datasets_provider is not None: + train_ds = None + # Only need to build dataset on tp rank 0 since Megatron has the + # broadcast_data() function that broadcast data from tp rank 0. + if mpu.get_tensor_model_parallel_rank() == 0: + # Number of train/valid/test samples. 
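# Illustrative sketch only (made-up settings): the sample counts handed to the
# dataset provider; only the train count matters for this path, the eval/test
# entries exist purely to satisfy the provider's signature.
train_iters = 1000
global_batch_size = 512
eval_interval, eval_iters = 100, 10
train_samples = train_iters * global_batch_size                     # 512000
eval_iters_total = (train_iters // eval_interval + 1) * eval_iters  # 110
train_val_test_num_samples = [train_samples,
                              eval_iters_total * global_batch_size,  # 56320
                              eval_iters * global_batch_size]        # 5120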
+ if args.train_samples: + train_samples = args.train_samples + else: + train_samples = args.train_iters * args.global_batch_size + # eval_iters and test_iters here are not actually used, only for + # satisfying the input of build_train_valid_test_datasets_provider. + # We only need to build the training data here. And we follow + # baseline's logic to build eval/test dataset later in + # build_train_valid_test_data_iterators. + eval_iters = (args.train_iters // args.eval_interval + 1) * \ + args.eval_iters + test_iters = args.eval_iters + train_val_test_num_samples = [train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size] + # Build the datasets. + train_ds, _, _ = build_train_valid_test_datasets_provider( + train_val_test_num_samples) + model, optimizer, args.deepspeed_dataloader, lr_scheduler = deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=lr_scheduler, + training_data=train_ds, + mpu=mpu if args.no_pipeline_parallel else None + ) + model.set_data_post_process_func(data_post_process) + else: + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=lr_scheduler, + mpu=mpu if args.no_pipeline_parallel else None + ) if isinstance(model, deepspeed.PipelineEngine): # hack to get batch_fn from pretrain_gpt.py model.set_batch_fn(model.module._megatron_batch_fn) @@ -507,6 +559,10 @@ def setup_model_and_optimizer(model_provider_func, teacher=False): if args.fp16: optimizer.reload_model_params() + # random-LTD requires converting transformer layers + if args.random_ltd: + model[0] = convert_to_random_ltd(model[0], ParallelTransformerLayer) + return model, optimizer, lr_scheduler @@ -744,9 +800,27 @@ def add_to_logging(name): args.consumed_train_samples) writer.add_scalar('params-norm/params-norm vs tokens', params_norm, args.consumed_train_tokens) - if args.curriculum_learning: - writer.add_scalar('curriculum_seqlen', args.curriculum_seqlen, + if hasattr(args, 'actual_seq_length'): + writer.add_scalar('seqlen/actual_seq_length', args.actual_seq_length, + iteration) + writer.add_scalar('seqlen/actual_seq_length vs samples', args.actual_seq_length, + args.consumed_train_samples) + writer.add_scalar('seqlen/actual_seq_length vs tokens', args.actual_seq_length, + args.consumed_train_tokens) + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + writer.add_scalar('seqlen/curriculum_seqlen', args.curriculum_seqlen, + iteration) + writer.add_scalar('seqlen/curriculum_seqlen vs samples', args.curriculum_seqlen, + args.consumed_train_samples) + writer.add_scalar('seqlen/curriculum_seqlen vs tokens', args.curriculum_seqlen, + args.consumed_train_tokens) + if args.random_ltd: + writer.add_scalar('seqlen/random_ltd_reserved_length', args.random_ltd_reserved_length, iteration) + writer.add_scalar('seqlen/random_ltd_reserved_length vs samples', args.random_ltd_reserved_length, + args.consumed_train_samples) + writer.add_scalar('seqlen/random_ltd_reserved_length vs tokens', args.random_ltd_reserved_length, + args.consumed_train_tokens) if args.log_timers_to_tensorboard: timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) @@ -825,7 +899,9 @@ def add_to_logging(name): if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() elapsed_time_per_iteration = elapsed_time / total_iterations - seq_len = args.curriculum_seqlen if args.curriculum_learning else args.seq_length + seq_len = 
args.seq_length + if hasattr(args, 'actual_seq_length'): + seq_len = args.actual_seq_length hidden_size = args.hidden_size num_layers = args.num_layers vocab_size = args.padded_vocab_size @@ -871,8 +947,11 @@ def add_to_logging(name): log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) if params_norm is not None: log_string += ' params norm: {:.3f} |'.format(params_norm) - if args.curriculum_learning: + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) + if args.random_ltd: + log_string += ' random ltd reserved length: {:5d} |'.format(args.random_ltd_reserved_length) + log_string += ' actual seqlen: {:5d} |'.format(seq_len) log_string += ' number of skipped iterations: {:3d} |'.format( total_loss_dict[skipped_iters_key]) log_string += ' number of nan iterations: {:3d} |'.format( @@ -915,6 +994,11 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Write args to tensorboard write_args_to_tensorboard() + if args.random_ltd: + # random-ltd requires different randomness on each rank + import random + random.seed(args.seed + torch.distributed.get_rank()) + # Turn on training mode which enables dropout. for model_module in model: model_module.train() @@ -928,6 +1012,10 @@ def train(forward_step_func, model, optimizer, lr_scheduler, timers('interval-time').start() print_datetime('before the start of training step') report_memory_flag = True + if args.random_ltd: + assert model[0].random_ltd_enabled() + args.random_ltd_layer_num = model[0].random_ltd_scheduler.get_random_ltd_layer_num() + while iteration < args.train_iters and (args.train_tokens is None or \ args.consumed_train_tokens < args.train_tokens): update_num_microbatches(args.consumed_train_samples) @@ -938,7 +1026,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, get_num_microbatches() model[0].set_train_batch_size(global_batch_size) - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ args.iteration + 1) loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ @@ -953,14 +1041,25 @@ def train(forward_step_func, model, optimizer, lr_scheduler, args.micro_batch_size * \ get_num_microbatches() args.consumed_train_samples += new_samples - if not args.custom_token_counting: - # Models like BERT have padding thus need special token counting. - # See example in ../../pretrain_bert.py. - if args.curriculum_learning: - args.consumed_train_tokens += new_samples * args.curriculum_seqlen + # This actual_seq_length is used for actual consumed tokens calculation, flops calculation, and logging. 
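# Worked example only (made-up values): the effective sequence length when
# random-LTD is active is a per-layer average of the full sequence (in the
# unwrapped layers) and the reserved length (in the random-LTD layers).
seq_length = 2048
num_layers = 24
random_ltd_layer_num = 22          # layers wrapped by random-LTD
random_ltd_reserved_length = 1024  # tokens kept in those layers
actual_seq_length = (seq_length * (num_layers - random_ltd_layer_num)
                     + random_ltd_reserved_length * random_ltd_layer_num) // num_layers
# -> (2048*2 + 1024*22) // 24 = 1109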
+ args.actual_seq_length = args.seq_length + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + args.actual_seq_length = args.curriculum_seqlen + if args.random_ltd: + args.random_ltd_reserved_length = model[0].random_ltd_scheduler.get_current_seq() + if args.random_ltd_reserved_length < args.actual_seq_length: + args.actual_seq_length = (args.actual_seq_length * (args.num_layers - args.random_ltd_layer_num) + args.random_ltd_reserved_length * args.random_ltd_layer_num) // args.num_layers + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + if hasattr(args, 'data_efficiency_curriculum_learning_numel'): + act_mbsz = args.data_efficiency_curriculum_learning_numel / args.curriculum_seqlen + act_token = act_mbsz * args.actual_seq_length + args.consumed_train_tokens += mpu.get_data_parallel_world_size() * \ + get_num_microbatches() * act_token else: - args.consumed_train_tokens += new_samples * args.seq_length - + args.consumed_train_tokens += new_samples * args.actual_seq_length + else: + args.consumed_train_tokens += new_samples * args.actual_seq_length + # Logging. if args.deepspeed: if hasattr(model[0].optimizer, 'cur_scale'): @@ -1037,7 +1136,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): for model_module in model: model_module.eval() - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: # When curriculum learning is used with pipeline parallelism, we need # this logic to ensure that the eval data is not truncated. If there # is a seqlen change due to that, we need to call @@ -1093,7 +1192,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): for key in total_loss_dict: total_loss_dict[key] /= args.eval_iters * get_num_microbatches() - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: # roll back to actual curriculum seqlen at the end of eval. args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ args.iteration + 1) diff --git a/megatron/utils.py b/megatron/utils.py index 59a0a12f3..99c9438bc 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -231,7 +231,10 @@ def throughput_calculator(model, args, iteration_time, total_iterations): # The factor of 4 is when used with activation check-pointing, # otherwise it will be 3. checkpoint_activations_factor = 4 if args.checkpoint_activations else 3 - flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * args.seq_length * num_layers * (hidden_size**2)) * (1. + (args.seq_length / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size))) + seq_len = args.seq_length + if hasattr(args, 'actual_seq_length'): + seq_len = args.actual_seq_length + flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. 
* num_layers * hidden_size))) tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12)) return samples_per_second, tflops, approx_parameters_in_billions diff --git a/pretrain_bert.py b/pretrain_bert.py index df5be7a06..c550d27e9 100644 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -17,6 +17,7 @@ from functools import partial +import math import torch import torch.nn.functional as F @@ -72,6 +73,28 @@ def get_batch(data_iterator): return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + effective_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + else: + effective_seqlen = torch.count_nonzero(data['padding_mask'], dim=1) + effective_seqlen = torch.max(effective_seqlen).to(torch.cuda.current_device()) + torch.distributed.all_reduce(effective_seqlen, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_data_parallel_group()) + effective_seqlen = effective_seqlen.item() + # Has to be multiple of 8 to enable Tensor Core acceleration + if effective_seqlen % 8 != 0: + effective_seqlen = math.ceil(effective_seqlen / 8) * 8 + if effective_seqlen < args.seq_length: + data['text'] = data['text'][:, :effective_seqlen].contiguous() + data['types'] = data['types'][:, :effective_seqlen].contiguous() + data['loss_mask'] = data['loss_mask'][:, :effective_seqlen].contiguous() + data['labels'] = data['labels'][:, :effective_seqlen].contiguous() + data['padding_mask'] = data['padding_mask'][:, :effective_seqlen].contiguous() + return data def loss_func(loss_mask, sentence_order, output_tensor): lm_loss_, sop_logits = output_tensor @@ -110,10 +133,8 @@ def forward_step(data_iterator, model): data_iterator) timers('batch-generator').stop() - effective_train_tokens = torch.count_nonzero(padding_mask) - torch.distributed.all_reduce(effective_train_tokens, - group=mpu.get_data_parallel_group()) - args.consumed_train_tokens += effective_train_tokens.item() + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] if not args.bert_binary_head: types = None @@ -150,4 +171,5 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}, + data_post_process=data_post_process) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index f57b479a2..369152c8d 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -16,6 +16,7 @@ """Pretrain GPT""" import torch +import math from functools import partial from megatron import get_args from megatron import print_rank_0 @@ -116,6 +117,30 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_truncate' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + if current_seqlen < args.seq_length: + data['text'] = data['text'][:, :(current_seqlen+1)].contiguous() + elif 'seqlen_reshape' in 
data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape'] + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data['text']) + reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1) + data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1), + data['text'][:, -(current_seqlen+1):]), 0).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen+1)) + num_row = min(num_row, data['text'].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data['text'] = data['text'][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data def get_batch_pipe(data): """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" @@ -141,7 +166,7 @@ def get_batch_pipe(data): args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss) - if args.curriculum_learning and args.curriculum_seqlen < tokens.size()[1]: + if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: # seqlen-based curriculum learning # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] tokens = tokens[:, :args.curriculum_seqlen].contiguous() @@ -184,7 +209,7 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at if teacher_model: with torch.no_grad(): - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: assert args.curriculum_seqlen is not None curriculum_seqlen = args.curriculum_seqlen tokens = tokens[:, :curriculum_seqlen].contiguous() @@ -213,17 +238,23 @@ def forward_step(data_iterator, model): data_iterator) timers('batch-generator').stop() + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if hasattr(args, 'data_efficiency_curriculum_learning_seqlen_type') and \ + args.data_efficiency_curriculum_learning_seqlen_type == 'seqlen_reshape': + args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + if args.mos or args.kd: # The forward func can return either the loss or the logits, depending on whether passing in the labels or not. 
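# Illustrative sketch only (toy shapes): what the 'seqlen_reshape' post-process
# does to a batch. Tokens are repacked into rows of current_seqlen+1 so that a
# shorter curriculum length still consumes (nearly) all of the original tokens.
import math
import torch
text = torch.arange(36).view(4, 9)        # toy batch: 4 samples, seq_length 8 (+1 shift)
current_seqlen = 2                        # curriculum difficulty -> rows of 3 tokens
reshape_len = (text.size(1) // (current_seqlen + 1)) * (current_seqlen + 1)   # 9
repacked = torch.cat((text[:, :reshape_len].contiguous().view(-1, current_seqlen + 1),
                      text[:, -(current_seqlen + 1):]), 0).contiguous()       # [16, 3]
num_row = math.ceil(torch.numel(text) / (current_seqlen + 1))                 # 12
num_row = min(num_row, repacked.size(0))
if num_row > 1 and num_row % 2 != 0:
    num_row -= 1
repacked = repacked[:num_row, :]          # final shape: [12, 3]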
stu_output, *other_losses = model(tokens, position_ids, attention_mask) - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: assert args.curriculum_seqlen is not None labels = labels[:, :args.curriculum_seqlen].contiguous() output_tensor = mpu.vocab_parallel_cross_entropy(stu_output.contiguous().float(), labels) else: output_tensor, *other_losses = model(tokens, position_ids, attention_mask, labels=labels) - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() moe_losses = [] @@ -292,4 +323,5 @@ def git_ds_info(): if __name__ == "__main__": git_ds_info() pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index e5a852187..ca28e9ee3 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -394,6 +394,7 @@ def tasks_args(parser): group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') + group.add_argument('--num_fewshot', type=int, default = 0, help='Number of few-shot prompts.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') return parser @@ -408,7 +409,7 @@ def main(): # adaptive_seq_len hack #1: # CL automatically enables reset_activation_shape() which allows us to change input shapes # and it also reshapes the attenion scores in attention_mask_func - args.curriculum_learning = 1 + args.curriculum_learning_legacy = 1 task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) @@ -419,7 +420,7 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) - results = evaluator.evaluate(adaptor, task_dict, False, 0, None) + results = evaluator.evaluate(adaptor, task_dict, False, args.num_fewshot, None) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(results, indent=2)) From 515798f809669124fe178a7aa45af8e127f35a4d Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 14 Dec 2022 11:05:04 -0800 Subject: [PATCH 06/11] fix floating point in script (#101) --- .../pretrain/ds_pretrain_bert_336M_run.sh | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh index 46c6c48b5..f03c65ccf 100644 --- a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh @@ -17,7 +17,7 @@ # train_iters_in_million=2 # ltd_enabled="true" # ltd_start=200 -# ltd_step_in_million=1.8 +# ltd_step_in_million=18e-1 # dropout=0 # cl_enabled="true" # cl_num_metric=2 @@ -28,7 +28,7 @@ # 
cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # cl_2nd_metric="seqlen_truncate" @@ -38,7 +38,7 @@ # cl_2nd_clustering_type="single_cluster" # cl_2nd_min=128 # cl_2nd_max=512 -# cl_2nd_total_step_in_million=0.96 +# cl_2nd_total_step_in_million=96e-2 # cl_2nd_difficulty_step=8 # cl_2nd_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -55,10 +55,10 @@ ### Random layerwise token dropping (random-LTD). ## random-LTD 723B tokens (69%): # lr=1.45e-4 -# train_iters_in_million=1.38 +# train_iters_in_million=138e-2 # ltd_enabled="true" # ltd_start=200 -# ltd_step_in_million=1.8 +# ltd_step_in_million=18e-1 # dropout=0 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ # ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} @@ -66,11 +66,11 @@ ### Curriculum learning (CL). ## CL vocab rarity 734B tokens (70%): # lr=1.4e-4 -# train_iters_in_million=1.4 +# train_iters_in_million=14e-1 # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="voc" @@ -80,7 +80,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.7 +# cl_1st_total_step_in_million=7e-1 # cl_1st_difficulty_step=1 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -97,7 +97,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=2 # cl_1st_metric="voc" @@ -107,7 +107,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # cl_2nd_metric="seqlen_truncate" @@ -117,7 +117,7 @@ # cl_2nd_clustering_type="single_cluster" # cl_2nd_min=128 # cl_2nd_max=512 -# cl_2nd_total_step_in_million=0.96 +# cl_2nd_total_step_in_million=96e-2 # cl_2nd_difficulty_step=8 # cl_2nd_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -137,7 +137,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="seqlenvocabrarity" @@ -147,7 +147,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=5 # cl_1st_max=100 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -164,7 +164,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="voc" @@ -174,7 +174,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -191,7 +191,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="seqlen_truncate" @@ -201,7 +201,7 @@ # cl_1st_clustering_type="single_cluster" # cl_1st_min=128 # cl_1st_max=512 -# cl_1st_total_step_in_million=0.96 +# 
cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=8 # cl_1st_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -218,7 +218,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="seqlen" @@ -228,7 +228,7 @@ # cl_1st_clustering_type="single_cluster" # cl_1st_min=5 # cl_1st_max=100 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=8 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ From 9a52a2e4720a437af557bfccb207616b2b1217e2 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 19 Dec 2022 09:56:03 -0800 Subject: [PATCH 07/11] added unit test for megatron (#102) --- MANIFEST.in | 1 + tests/run_megatron.py | 113 +++++++++++++++++++++++++++++++++++++++++ tests/test_megatron.py | 61 ++++++++++++++++++++++ 3 files changed, 175 insertions(+) create mode 100644 tests/run_megatron.py create mode 100644 tests/test_megatron.py diff --git a/MANIFEST.in b/MANIFEST.in index f44791183..d6ca373a7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include megatron/data/Makefile include megatron/data/helpers.cpp +recursive-include megatron/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc diff --git a/tests/run_megatron.py b/tests/run_megatron.py new file mode 100644 index 000000000..ad96165b5 --- /dev/null +++ b/tests/run_megatron.py @@ -0,0 +1,113 @@ +import torch +import deepspeed +import megatron +from megatron import get_args +from megatron import mpu +from megatron.checkpointing import load_checkpoint +from megatron.initialize import initialize_megatron +from megatron.model import GPTModel +from megatron.training import get_model +from megatron.text_generation_utils import generate_samples_eval + + +def model_provider(pre_process=True, post_process=True): + model = GPTModel( + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process, + return_moe_loss=False, + ) + return model + + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title="text generation") + + group.add_argument( + "--temperature", type=float, default=1.0, help="Sampling temperature." + ) + group.add_argument( + "--greedy", action="store_true", default=False, help="Use greedy sampling." 
+ ) + group.add_argument("--top_p", type=float, default=0.0, help="Top p sampling.") + group.add_argument("--top_k", type=int, default=0, help="Top k sampling.") + group.add_argument( + "--out-seq-length", + type=int, + default=1024, + help="Size of the output generated text.", + ) + group.add_argument( + "--sample-input-file", + type=str, + default=None, + help="Get input from file instead of interactive mode, " + "each line is an input.", + ) + group.add_argument( + "--sample-output-file", + type=str, + default=None, + help="Output file got from --sample-input-file", + ) + group.add_argument( + "--num-samples", + type=int, + default=0, + help="Number of samples to generate unconditionally, " + "defaults to 0 and interactive conditional sampling", + ) + group.add_argument( + "--genfile", type=str, help="Output file when generating unconditionally" + ) + group.add_argument( + "--recompute", + action="store_true", + help="During generation recompute all attention " + "instead of using previously computed keys/values.", + ) + group.add_argument( + "--context-tokens", type=str, default="DeepSpeed is the greatest" + ) + group.add_argument("--max-tokens", type=int, default=50) + + return parser + + +if __name__ == "__main__": + # initialize megatron + initialize_megatron( + extra_args_provider=add_text_generate_args, + args_defaults={ + "tokenizer_type": "GPT2BPETokenizer", + "no_load_rng": True, + "no_load_optim": True, + }, + ) + args = get_args() + + # setup model + model = get_model(model_provider) + _ = load_checkpoint(model, None, None) + model = model[0] + if args.ds_inference: + engine = deepspeed.init_inference( + model=model, + mp_size=args.tensor_model_parallel_size, + tensor_parallel={"mpu": mpu}, + dtype=torch.half, + replace_with_kernel_inject=True, + moe_experts=args.num_experts, + moe_type=args.mlp_type, + ) + model = engine.module + + # generate output + generate_samples_eval( + model, args.context_tokens, 1, 0 + ) # Just so we don't get log output from DeepSpeed (this should be removed once we improve logging in DeepSpeed) + print("===START OUTPUT===") + print(generate_samples_eval(model, args.context_tokens, args.max_tokens, 0)) + print("===END OUTPUT===") diff --git a/tests/test_megatron.py b/tests/test_megatron.py new file mode 100644 index 000000000..e7342c244 --- /dev/null +++ b/tests/test_megatron.py @@ -0,0 +1,61 @@ +import pytest +import os +import re +import subprocess + + +@pytest.fixture(params=[1]) +def moe_num_experts(request): + return str(request.param) + + +@pytest.fixture(params=[1]) +def mp_size(request): + return str(request.param) + + +@pytest.fixture +def params(moe_num_experts, mp_size): + base_dir = os.getenv("MEGATRON_CKPT_DIR") + assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment" + + vocab_file = os.path.join(base_dir, "gpt2-vocab.json") + merge_file = os.path.join(base_dir, "gpt2-merges.txt") + ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m") + + return [ + "--micro-batch-size", "1", + "--num-layers", "24", + "--hidden-size", "1024", + "--num-attention-heads", "16", + "--max-position-embeddings", "1024", + "--vocab-file", vocab_file, + "--merge-file", merge_file, + "--load", ckpt_path, + "--seq-length", "1024", + "--out-seq-length", "1024", + "--tensor-model-parallel-size", mp_size, + "--tokenizer-type", "GPT2BPETokenizer", + "--num-experts", moe_num_experts, + "--mlp-type", "standard", + "--num-samples", "0", + "--fp16", + ] + + +def test_moe_megatron(params, mp_size): + output_re = r"===START OUTPUT===([\S\s]*)===END 
OUTPUT===" + + # Run the baseline + baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron.py"] + params + result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE) + baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) + + # Run with DeepSpeed + deepspeed_cmd = baseline_cmd + ["--ds-inference"] + result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE) + deepspeed_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) + + assert ( + baseline_output == deepspeed_output + ), f"outputs do not match: {baseline_output}\n{deepspeed_output}" From 789f2a9047d565e47c53a813d296775387eb961f Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 20 Dec 2022 15:42:47 -0800 Subject: [PATCH 08/11] fix deprecated numpy types (#103) --- megatron/data/indexed_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 7f1379671..20344f889 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -98,8 +98,8 @@ def write_longs(f, a): 3: np.int16, 4: np.int32, 5: np.int64, - 6: np.float, - 7: np.double, + 6: np.float32, + 7: np.float64, 8: np.uint16 } @@ -271,8 +271,8 @@ class IndexedDatasetBuilder(object): np.int16: 2, np.int32: 4, np.int64: 8, - np.float: 4, - np.double: 8 + np.float32: 4, + np.float64: 8 } def __init__(self, out_file, dtype=np.int32): From c24020485dc9199995c8eb78fdb1c743a4605441 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Fri, 23 Dec 2022 12:59:24 -0800 Subject: [PATCH 09/11] fix script typo --- .../bert/pretrain/ds_pretrain_bert_336M_base_script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh index 551ca3118..b5a59f324 100644 --- a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh @@ -300,7 +300,7 @@ data_options=" \ megatron_options=" \ --override-lr-scheduler \ --adam-beta1 0.9 \ - --adam-beta2 0.95 \ + --adam-beta2 0.999 \ --tensor-model-parallel-size ${mp_size} \ --init-method-std ${init_std} \ --lr-decay-tokens ${lr_decay_tokens} \ @@ -469,4 +469,4 @@ if [[ $iteration -gt 0 ]]; then ds_ssh "echo $iteration_2 > $iteration_file_2" fi -deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log \ No newline at end of file +deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log From 178beeb45d8f89c926366659900934351aa007e6 Mon Sep 17 00:00:00 2001 From: mzl Date: Mon, 30 Jan 2023 23:12:06 +0800 Subject: [PATCH 10/11] Use accelerator for supporting other device (#104) * xpu support (#55) * port accel abs interfece * WA for run3.6b * move on * fix current_dievice * fix typo * enable to run 345M GPT * delete apex_patch * add TODO xpu compatible tg for xpu WA * use deepspeed launcher * enable run3.6b bf16 * add zero2 config json * readd enable_each_rank_log * fix typos * add ccl arg * fix * use short word * use no-masked-softmax-fusion * readd * set train iters to 10 * remove duplicate line * change assert msg * update format * add whitespace * update path * update note * update * fix typos * delete notes * update 
format * update xpu check to cuda check * update * clean up file * fix typos * add python based gradient clipping * change condition for python based path --- megatron/__init__.py | 4 +- megatron/arguments.py | 2 +- megatron/checkpointing.py | 6 +-- megatron/data/biencoder_dataset_utils.py | 4 +- megatron/data/dataset_utils.py | 6 +-- megatron/data/gpt_dataset.py | 4 +- megatron/data/realm_dataset_utils.py | 4 +- megatron/global_vars.py | 6 +-- megatron/initialize.py | 22 +++++----- megatron/memory.py | 4 +- megatron/model/__init__.py | 8 ++-- megatron/model/distributed.py | 4 +- megatron/model/fused_softmax.py | 1 - megatron/model/module.py | 8 ++-- megatron/model/realm_model.py | 6 +-- megatron/model/transformer.py | 3 +- megatron/model/vit_model.py | 3 +- megatron/mpu/data.py | 8 ++-- megatron/mpu/layers.py | 12 +++--- megatron/mpu/random.py | 35 ++++++++-------- megatron/mpu/tests/commons.py | 6 +-- megatron/mpu/tests/test_cross_entropy.py | 9 +++-- megatron/mpu/tests/test_data.py | 3 +- megatron/mpu/tests/test_layers.py | 48 +++++++++++----------- megatron/mpu/tests/test_random.py | 22 +++++----- megatron/optimizer/__init__.py | 9 ++++- megatron/optimizer/clip_grads.py | 51 ++++++++++++++---------- megatron/optimizer/grad_scaler.py | 12 +++--- megatron/optimizer/optimizer.py | 34 +++++++++------- megatron/p2p_communication.py | 8 ++-- megatron/text_generation_utils.py | 39 +++++++++--------- megatron/training.py | 33 +++++++-------- megatron/utils.py | 35 +++++++++------- pretrain_gpt.py | 3 +- pretrain_ict.py | 10 ++--- pretrain_vit.py | 4 +- tasks/eval_harness/evaluate.py | 2 +- tasks/eval_utils.py | 3 +- tasks/finetune_utils.py | 10 ++--- tasks/orqa/evaluate_utils.py | 12 +++--- tasks/orqa/natural_questions/nq.py | 9 +++-- tasks/vision/eval_utils.py | 4 +- tasks/vision/finetune_utils.py | 6 +-- tasks/zeroshot_gpt/evaluate.py | 6 +-- 44 files changed, 282 insertions(+), 246 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index c5b9b3af6..93894cd3f 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. import os import torch - +from deepspeed.accelerator import get_accelerator from .package_info import ( __description__, __contact_names__, @@ -64,7 +64,7 @@ def is_rank_0(): """Check whether it is rank 0. 
For AML, check if it is rank 0 of a node""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0 or ( - is_aml() and torch.distributed.get_rank() % torch.cuda.device_count() == 0 + is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0 ): return True else: diff --git a/megatron/arguments.py b/megatron/arguments.py index 4c4e00dcd..bdd17450b 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -653,7 +653,7 @@ def _add_distributed_args(parser): group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None, help='Number of layers per virtual pipeline stage') group.add_argument('--distributed-backend', default='nccl', - choices=['nccl', 'gloo'], + choices=['nccl', 'gloo', 'ccl'], help='Which backend to use for distributed training.') group.add_argument('--DDP-impl', default='local', choices=['local', 'torch'], diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 52c2a84da..fbef9da00 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -19,7 +19,7 @@ import random import sys import numpy as np - +from deepspeed.accelerator import get_accelerator import torch from megatron import (get_args, @@ -150,7 +150,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): state_dict['random_rng_state'] = random.getstate() state_dict['np_rng_state'] = np.random.get_state() state_dict['torch_rng_state'] = torch.get_rng_state() - state_dict['cuda_rng_state'] = torch.cuda.get_rng_state() + state_dict['cuda_rng_state'] = get_accelerator().get_rng_state() state_dict['rng_tracker_states'] \ = mpu.get_cuda_rng_tracker().get_states() @@ -417,7 +417,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True random.setstate(state_dict['random_rng_state']) np.random.set_state(state_dict['np_rng_state']) torch.set_rng_state(state_dict['torch_rng_state']) - torch.cuda.set_rng_state(state_dict['cuda_rng_state']) + get_accelerator().set_rng_state(state_dict['cuda_rng_state']) # Check for empty states array if not state_dict['rng_tracker_states']: raise KeyError diff --git a/megatron/data/biencoder_dataset_utils.py b/megatron/data/biencoder_dataset_utils.py index f7b3b961b..e22cc7e89 100644 --- a/megatron/data/biencoder_dataset_utils.py +++ b/megatron/data/biencoder_dataset_utils.py @@ -8,7 +8,7 @@ from megatron.data.dataset_utils import create_masked_lm_predictions, \ pad_and_convert_to_numpy from megatron.data.data_samplers import MegatronPretrainingSampler - +from deepspeed.accelerator import get_accelerator def make_attention_mask(source_block, target_block): """ Returns a 2-dimensional (2-D) attention mask @@ -187,7 +187,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) assert counts[0].item() == torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index cf8ccb9fd..8cb7f85eb 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -33,7 +33,7 @@ ) from megatron.data.blendable_dataset import BlendableDataset from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - +from deepspeed.accelerator import 
get_accelerator DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' @@ -711,8 +711,8 @@ def get_samples_mapping(indexed_dataset, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - if torch.cuda.device_count() > 0: # Skip when CPU-only - counts = torch.cuda.LongTensor([1]) + if get_accelerator().device_count() > 0: # Skip when CPU-only + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) assert counts[0].item() == ( diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 5625f4586..a14179bb9 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -20,7 +20,7 @@ import numpy as np import torch - +from deepspeed.accelerator import get_accelerator from megatron import mpu, is_rank_0, print_rank_0, get_args from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples @@ -313,7 +313,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) assert counts[0].item() == ( diff --git a/megatron/data/realm_dataset_utils.py b/megatron/data/realm_dataset_utils.py index aecf5549a..b7d4a40c6 100644 --- a/megatron/data/realm_dataset_utils.py +++ b/megatron/data/realm_dataset_utils.py @@ -7,7 +7,7 @@ from megatron import mpu, print_rank_0 from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy from megatron import get_args, get_tokenizer, print_rank_0, mpu - +from deepspeed.accelerator import get_accelerator def get_one_epoch_dataloader(dataset, micro_batch_size=None): """Specifically one epoch to be used in an indexing job.""" @@ -177,7 +177,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) assert counts[0].item() == torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) diff --git a/megatron/global_vars.py b/megatron/global_vars.py index c486f0d6b..21b647dac 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -24,7 +24,7 @@ from megatron.tokenizer import build_tokenizer from .arguments import parse_args from .microbatches import build_num_microbatches_calculator - +from deepspeed.accelerator import get_accelerator _GLOBAL_ARGS = None _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None _GLOBAL_TOKENIZER = None @@ -192,14 +192,14 @@ def __init__(self, name): def start(self): """Start the timer.""" assert not self.started_, 'timer has already been started' - torch.cuda.synchronize() + get_accelerator().synchronize() self.start_time = time.time() self.started_ = True def stop(self): """Stop the timer.""" assert self.started_, 'timer is not started' - torch.cuda.synchronize() + get_accelerator().synchronize() 
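# Illustrative sketch only: the pattern this patch applies throughout is to
# route torch.cuda calls through DeepSpeed's accelerator interface so the same
# code runs on CUDA, XPU, or a CPU-only setup. Only calls already used in the
# patch (device_count, set_device, LongTensor, current_device_name,
# synchronize) are shown here.
import torch
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()
if accel.device_count() > 0:
    accel.set_device(0)
    flag = accel.LongTensor([1])              # device tensor, no torch.cuda reference
    device = accel.current_device_name()
    accel.synchronize()
else:
    flag = torch.LongTensor([1])              # CPU-only fallback
    device = 'cpu'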
self.elapsed_ += (time.time() - self.start_time) self.started_ = False diff --git a/megatron/initialize.py b/megatron/initialize.py index 5ed7dc089..4b7e11a6c 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -21,7 +21,6 @@ import numpy as np import torch - from megatron import fused_kernels from megatron import get_adlr_autoresume from megatron import get_args @@ -30,7 +29,7 @@ from megatron.global_vars import set_global_variables from megatron.mpu import (set_tensor_model_parallel_rank, set_tensor_model_parallel_world_size) - +from deepspeed.accelerator import get_accelerator import deepspeed import deepspeed.utils.groups as groups @@ -46,7 +45,7 @@ def initialize_megatron(extra_args_provider=None, args_defaults={}, """ if not allow_no_cuda: # Make sure cuda is available. - assert torch.cuda.is_available(), 'Megatron requires CUDA.' + assert get_accelerator().is_available(), 'Megatron requires accelerator.' # Parse args, build tokenizer, and set adlr-autoresume, # tensorboard-writer, and timers. @@ -107,7 +106,10 @@ def _compile_dependencies(): compile_helper() print('>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) - + + if not get_accelerator().device_name() == 'cuda': + print(">fused kernel is only supported in cuda, skip loading fused kernel") + return # ================== # Load fused kernels # ================== @@ -134,7 +136,7 @@ def _compile_dependencies(): if _is_rank_0(): start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) - if torch.cuda.device_count() > 0: # Skip when CPU-only + if get_accelerator().device_count() > 0: # Skip when CPU-only fused_kernels.load(args) torch.distributed.barrier() else: @@ -185,7 +187,7 @@ def setup_deepspeed_random_and_activation_checkpointing(args): def _initialize_distributed(): """Initialize torch.distributed and mpu.""" args = get_args() - device_count = torch.cuda.device_count() + device_count = get_accelerator().device_count() if torch.distributed.is_initialized(): if args.rank == 0: @@ -206,7 +208,7 @@ def _initialize_distributed(): else: args.local_rank = device - torch.cuda.set_device(device) # only do so when device_count > 0 + get_accelerator().set_device(device) # only do so when device_count > 0 # Call the init process init_method = 'tcp://' @@ -249,14 +251,14 @@ def _set_random_seed(seed_): if seed_ is not None and seed_ > 0: # Ensure that different pipeline MP stages get different seeds. # No need to do so for CPU-only case. - if torch.cuda.device_count() == 0: + if get_accelerator().device_count() == 0: seed = seed_ else: seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - if torch.cuda.device_count() > 0: + if get_accelerator().device_count() > 0: mpu.model_parallel_cuda_manual_seed(seed) else: raise ValueError('Seed ({}) should be a positive integer.'.format(seed)) @@ -284,7 +286,7 @@ def _is_rank_0(): """Check whether it is rank 0. 
For AML, check if it is rank 0 of a node""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0 or ( - 'AZUREML_EXPERIMENT_ID' in os.environ and torch.distributed.get_rank() % torch.cuda.device_count() == 0 + 'AZUREML_EXPERIMENT_ID' in os.environ and torch.distributed.get_rank() % get_accelerator().device_count() == 0 ): return True else: diff --git a/megatron/memory.py b/megatron/memory.py index be5a117bc..3b06d60d4 100644 --- a/megatron/memory.py +++ b/megatron/memory.py @@ -15,7 +15,7 @@ import torch - +from deepspeed.accelerator import get_accelerator # A dictionary of all the memory buffers allocated. _MEM_BUFFS = dict() @@ -58,7 +58,7 @@ def __init__(self, name, numel, dtype, track_usage): self.dtype = dtype self.data = torch.empty(self.numel, dtype=self.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) # Index tracking the start of the free memory. diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index baf54e455..a2f3be016 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -12,9 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm - +from deepspeed.accelerator.real_accelerator import get_accelerator +if get_accelerator().device_name() == 'cuda': + from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm +else: + from torch.nn import LayerNorm from .distributed import DistributedDataParallel from .bert_model import BertModel from .gpt_model import GPTModel, GPTModelPipe diff --git a/megatron/model/distributed.py b/megatron/model/distributed.py index 53d336223..ad86345c7 100644 --- a/megatron/model/distributed.py +++ b/megatron/model/distributed.py @@ -22,7 +22,7 @@ from megatron import get_args from megatron import mpu from .module import MegatronModule - +from deepspeed.accelerator import get_accelerator class MemoryBuffer: @@ -32,7 +32,7 @@ def __init__(self, numel, dtype): self.dtype = dtype self.data = torch.zeros(self.numel, dtype=self.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 097b29ef4..8d52967d9 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -154,7 +154,6 @@ def forward(self, input, mask): input = input * self.scale mask_output = self.mask_func(input, mask) if mask is not None else input probs = torch.nn.Softmax(dim=-1)(mask_output) - if self.input_in_float16 and self.softmax_in_fp32: if self.input_in_fp16: probs = probs.half() diff --git a/megatron/model/module.py b/megatron/model/module.py index df92d95a9..9f91c8bd1 100644 --- a/megatron/model/module.py +++ b/megatron/model/module.py @@ -18,14 +18,14 @@ import torch from torch.autograd import Variable from torch.nn.parameter import Parameter - +from deepspeed.accelerator import get_accelerator from megatron import get_args from megatron import mpu -_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) -_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) -_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) +_FLOAT_TYPES = (torch.FloatTensor, get_accelerator().FloatTensor) +_HALF_TYPES = (torch.HalfTensor, get_accelerator().HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, 
get_accelerator().BFloat16Tensor) diff --git a/megatron/model/realm_model.py b/megatron/model/realm_model.py index 5730a85e3..e74eb2e58 100644 --- a/megatron/model/realm_model.py +++ b/megatron/model/realm_model.py @@ -12,7 +12,7 @@ from megatron.model.language_model import get_language_model from megatron.model.utils import scaled_init_method_normal from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids - +from deepspeed.accelerator import get_accelerator def general_ict_model_provider(only_query_model=False, only_block_model=False): """Build the model.""" @@ -72,7 +72,7 @@ def forward(self, query_tokens, query_attention_mask, block_tokens, block_attent def embed_query(self, query_tokens, query_attention_mask): """Embed a batch of tokens using the query model""" if self.use_query_model: - query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) + query_types = get_accelerator().LongTensor(*query_tokens.shape).fill_(0) query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types) return query_ict_logits else: @@ -81,7 +81,7 @@ def embed_query(self, query_tokens, query_attention_mask): def embed_block(self, block_tokens, block_attention_mask): """Embed a batch of tokens using the block model""" if self.use_block_model: - block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0) + block_types = get_accelerator().LongTensor(*block_tokens.shape).fill_(0) block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types) return block_ict_logits else: diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 24b46d5c7..692eac3d9 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -29,6 +29,7 @@ from torch import distributed as dist import deepspeed from deepspeed.moe.layer import MoE +from deepspeed.accelerator import get_accelerator # flags required to enable jit fusion kernels torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) @@ -275,7 +276,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, output_size[2], output_size[3], dtype=query_layer.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) # Raw attention scores. 
[b * np, sq, sk] matmul_result = torch.baddbmm( diff --git a/megatron/model/vit_model.py b/megatron/model/vit_model.py index 84a52a829..ae3d56281 100644 --- a/megatron/model/vit_model.py +++ b/megatron/model/vit_model.py @@ -27,6 +27,7 @@ scaled_init_method_normal, ) from .module import MegatronModule +from deepspeed.accelerator import get_accelerator class VitMlpHead(MegatronModule): @@ -164,7 +165,7 @@ def __init__(self, num_classes, finetune=False): init_method_normal(args.init_method_std)( self.position_embeddings.weight ) - self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() + self.position_ids = torch.arange(self.seq_length).expand(1, -1).to(get_accelerator().device_name()) self.position_embeddings._register_load_state_dict_pre_hook( twod_interpolate_position_embeddings_hook diff --git a/megatron/mpu/data.py b/megatron/mpu/data.py index dd57a8ffc..679955aa1 100644 --- a/megatron/mpu/data.py +++ b/megatron/mpu/data.py @@ -18,7 +18,7 @@ from .initialize import get_tensor_model_parallel_group from .initialize import get_tensor_model_parallel_rank from .initialize import get_tensor_model_parallel_src_rank - +from deepspeed.accelerator import get_accelerator _MAX_DATA_DIM = 5 @@ -46,7 +46,7 @@ def _build_key_size_numel_dictionaries(keys, data): offset += max_dim # Move to GPU and broadcast. - sizes_cuda = torch.cuda.LongTensor(sizes) + sizes_cuda = get_accelerator().LongTensor(sizes) torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group()) @@ -94,10 +94,10 @@ def broadcast_data(keys, data, datatype): _check_data_types(keys, data, datatype) # Flatten the data associated with the keys flatten_data = torch.cat( - [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + [data[key].contiguous().view(-1) for key in keys], dim=0).to(get_accelerator().device_name()) else: flatten_data = torch.empty(total_numel, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=datatype) # Broadcast diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 0d81d5622..5d168c672 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -37,7 +37,7 @@ from .utils import VocabUtility from megatron import get_args import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing - +from deepspeed.accelerator import get_accelerator _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False, 'partition_dim': -1, @@ -176,7 +176,7 @@ def __init__(self, num_embeddings, embedding_dim, else: self.weight = Parameter(torch.empty( self.num_embeddings_per_partition, self.embedding_dim, - device=torch.cuda.current_device(), dtype=args.params_dtype)) + device=get_accelerator().current_device_name(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) @@ -264,7 +264,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, else: self.weight = Parameter(torch.empty( self.output_size_per_partition, self.input_size, - device=torch.cuda.current_device(), dtype=args.params_dtype)) + device=get_accelerator().current_device_name(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=stride) @@ -275,7 +275,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, else: self.bias = Parameter(torch.empty( self.output_size_per_partition, - device=torch.cuda.current_device(), + 
device=get_accelerator().current_device_name(), dtype=args.params_dtype)) set_tensor_model_parallel_attributes(self.bias, True, 0, stride) # Always initialize bias to zero. @@ -375,7 +375,7 @@ def __init__(self, input_size, output_size, bias=True, else: self.weight = Parameter(torch.empty( self.output_size, self.input_size_per_partition, - device=torch.cuda.current_device(), dtype=args.params_dtype)) + device=get_accelerator().current_device_name(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=1, stride=stride) if bias: @@ -384,7 +384,7 @@ def __init__(self, input_size, output_size, bias=True, dtype=args.params_dtype)) else: self.bias = Parameter(torch.empty( - self.output_size, device=torch.cuda.current_device(), + self.output_size, device=get_accelerator().current_device_name(), dtype=args.params_dtype)) # Always initialize bias to zero. with torch.no_grad(): diff --git a/megatron/mpu/random.py b/megatron/mpu/random.py index 2a57316a1..beea791b5 100644 --- a/megatron/mpu/random.py +++ b/megatron/mpu/random.py @@ -21,7 +21,7 @@ import torch from torch import _C -from torch.cuda import _lazy_call, device as device_ctx_manager +from deepspeed.accelerator import get_accelerator from torch.utils.checkpoint import detach_variable from megatron import get_args @@ -80,25 +80,26 @@ def _set_cuda_rng_state(new_state, device=-1): if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): # older PyTorch def cb(): - with device_ctx_manager(device): + with get_accelerator().device(device): _C._cuda_setRNGState(new_state) else: # newer PyTorch if device == -1: - device = torch.device('cuda') + device = torch.device(get_accelerator().device_name()) elif isinstance(device, str): device = torch.device(device) elif isinstance(device, int): - device = torch.device('cuda', device) + device = torch.device(get_accelerator().device_name(), device) def cb(): idx = device.index if idx is None: - idx = torch.cuda.current_device() - default_generator = torch.cuda.default_generators[idx] + idx = get_accelerator().current_device() + + default_generator = get_accelerator().default_generator(idx) default_generator.set_state(new_state) - _lazy_call(cb) + get_accelerator().lazy_call(cb) def split_tensor_into_1d_equal_chunks(tensor): @@ -116,7 +117,7 @@ def gather_split_1d_tensor(tensor): numel = torch.numel(tensor) numel_gathered = world_size * numel gathered = torch.empty(numel_gathered, dtype=tensor.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)] torch.distributed.all_gather(chunks, tensor, @@ -167,10 +168,10 @@ def add(self, name, seed): if name in self.states_: raise Exception('cuda rng state {} already exists'.format(name)) # Get the current rng state. - orig_rng_state = torch.cuda.get_rng_state() + orig_rng_state = get_accelerator().get_rng_state() # Set the new state and store it. - torch.cuda.manual_seed(seed) - self.states_[name] = torch.cuda.get_rng_state() + get_accelerator().manual_seed(seed) + self.states_[name] = get_accelerator().get_rng_state() # Reset rng state to what it was. _set_cuda_rng_state(orig_rng_state) @@ -183,7 +184,7 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): print(name, self.states_) raise Exception('cuda rng state {} is not added'.format(name)) # Store current rng state. 
- orig_cuda_rng_state = torch.cuda.get_rng_state() + orig_cuda_rng_state = get_accelerator().get_rng_state() # Set rng state to the desired one _set_cuda_rng_state(self.states_[name]) # Do the stuff we wanted to do. @@ -191,7 +192,7 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): yield finally: # Update the current rng state for later use. - self.states_[name] = torch.cuda.get_rng_state() + self.states_[name] = get_accelerator().get_rng_state() # And set the state to the original state we started with. _set_cuda_rng_state(orig_cuda_rng_state) @@ -237,7 +238,7 @@ def model_parallel_cuda_manual_seed(seed): data_parallel_seed), flush=True) _CUDA_RNG_STATE_TRACKER.reset() # Set the default state. - torch.cuda.manual_seed(data_parallel_seed) + get_accelerator().manual_seed(data_parallel_seed) # and model parallel state. _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed) @@ -256,7 +257,7 @@ def forward(ctx, run_function, *args): # Copy the rng states. ctx.fwd_cpu_rng_state = torch.get_rng_state() - ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state = get_accelerator().get_rng_state() ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() with torch.no_grad(): @@ -288,7 +289,7 @@ def backward(ctx, *args): # Store the current states. bwd_cpu_rng_state = torch.get_rng_state() - bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state = get_accelerator().get_rng_state() bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() # Set the states to what it used to be before the forward pass. @@ -309,7 +310,7 @@ def backward(ctx, *args): if isinstance(outputs, torch.Tensor): outputs = (outputs,) elif len(outputs) == 2 and isinstance(outputs[1], torch.Tensor) and \ - torch.equal(outputs[1], torch.tensor(0).cuda()): + torch.equal(outputs[1], torch.tensor(0).to(get_accelerator().device_name())): # a hacky solution to overcome issue when running old script examples/pretrain_gpt_distributed.sh outputs = (outputs[0],) torch.autograd.backward(outputs, args) diff --git a/megatron/mpu/tests/commons.py b/megatron/mpu/tests/commons.py index 5e7a18672..432e720a0 100644 --- a/megatron/mpu/tests/commons.py +++ b/megatron/mpu/tests/commons.py @@ -20,7 +20,7 @@ import torch import mpu - +from deepspeed.accelerator import get_accelerator class IdentityLayer(torch.nn.Module): def __init__(self, size, scale=1.0): @@ -56,10 +56,10 @@ def initialize_distributed(backend='nccl'): 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) # Set the device id. - device = rank % torch.cuda.device_count() + device = rank % get_accelerator().device_count() if local_rank is not None: device = local_rank - torch.cuda.set_device(device) + get_accelerator().set_device(device) # Call the init process. 
init_method = 'tcp://' diff --git a/megatron/mpu/tests/test_cross_entropy.py b/megatron/mpu/tests/test_cross_entropy.py index 46d7ba981..8155e3645 100644 --- a/megatron/mpu/tests/test_cross_entropy.py +++ b/megatron/mpu/tests/test_cross_entropy.py @@ -23,6 +23,7 @@ import torch import random import sys +from deepspeed.accelerator import get_accelerator sys.path.append("../..") @@ -30,9 +31,9 @@ def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): set_random_seed(seed) identity = IdentityLayer((batch_size, seq_length, vocab_size), - scale=logits_scale).cuda() + scale=logits_scale).to(get_accelerator().device_name()) logits = identity() - target = torch.cuda.LongTensor( + target = get_accelerator().LongTensor( size=(batch_size, seq_length)).random_(0, vocab_size) loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), target.view(-1), @@ -45,10 +46,10 @@ def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): set_random_seed(seed) identity = IdentityLayer((batch_size, seq_length, vocab_size), - scale=logits_scale).cuda() + scale=logits_scale).to(get_accelerator().device_name()) logits = identity() logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits) - target = torch.cuda.LongTensor( + target = get_accelerator().LongTensor( size=(batch_size, seq_length)).random_(0, vocab_size) loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() loss.backward() diff --git a/megatron/mpu/tests/test_data.py b/megatron/mpu/tests/test_data.py index ae3627703..630b00900 100644 --- a/megatron/mpu/tests/test_data.py +++ b/megatron/mpu/tests/test_data.py @@ -15,6 +15,7 @@ from commons import print_separator from commons import initialize_distributed +from deepspeed.accelerator import get_accelerator from mpu import data as data_utils import mpu import torch @@ -65,7 +66,7 @@ def test_broadcast_data(tensor_model_parallel_size): data_b = data_utils.broadcast_data(keys, data, torch.int64) for key in keys: - tensor = data_t[key].cuda() + tensor = data_t[key].to(get_accelerator().device_name()) assert data_b[key].sub(tensor).abs().max() == 0 # Reset groups diff --git a/megatron/mpu/tests/test_layers.py b/megatron/mpu/tests/test_layers.py index b12f48509..c5e3a2afc 100644 --- a/megatron/mpu/tests/test_layers.py +++ b/megatron/mpu/tests/test_layers.py @@ -25,7 +25,7 @@ import sys sys.path.append("../..") - +device_name = get_accelerator().device_name() def test_parallel_embedding(tensor_model_parallel_size): if torch.distributed.get_rank() == 0: @@ -43,11 +43,11 @@ def test_parallel_embedding(tensor_model_parallel_size): set_random_seed(123) input_data = torch.LongTensor( - size=(batch_size, seq_length)).random_(0, vocab_size).cuda() - loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + size=(batch_size, seq_length)).random_(0, vocab_size).to(device_name) + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).to(device_name) set_random_seed(seed) - embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).to(device_name) output = embedding_original(input_data) loss_original = torch.mul(output, loss_weight).sum() @@ -55,14 +55,14 @@ def test_parallel_embedding(tensor_model_parallel_size): set_random_seed(seed) embedding_parallel = layers.ParallelEmbedding( - vocab_size, hidden_size, init_method=init.normal_).cuda() + vocab_size, hidden_size, init_method=init.normal_).to(device_name) output = embedding_parallel(input_data) 
loss_parallel = torch.mul(output, loss_weight).sum() loss_parallel.backward() set_random_seed(seed) embedding_vocab_parallel = layers.VocabParallelEmbedding( - vocab_size, hidden_size, init_method=init.normal_).cuda() + vocab_size, hidden_size, init_method=init.normal_).to(device_name) output = embedding_vocab_parallel(input_data) loss_vocab_parallel = torch.mul(output, loss_weight).sum() loss_vocab_parallel.backward() @@ -200,10 +200,10 @@ def test_column_parallel_linear(tensor_model_parallel_size): batch_size = 7 # Network - identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + identity_layer = IdentityLayer2D(batch_size, input_size).to(device_name) linear_layer = mpu.ColumnParallelLinear( - input_size, output_size, keep_master_weight_for_test=True).cuda() - loss_weight = torch.randn([batch_size, output_size]).cuda() + input_size, output_size, keep_master_weight_for_test=True).to(device_name) + loss_weight = torch.randn([batch_size, output_size]).to(device_name) # Forward input_ = identity_layer() output = linear_layer(input_) @@ -214,9 +214,9 @@ def test_column_parallel_linear(tensor_model_parallel_size): # Values. dLdY = loss_weight X = identity_layer.weight - A = linear_layer.master_weight.cuda() + A = linear_layer.master_weight.to(device_name) dLdA = torch.matmul(dLdY.t(), X) - dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdb = torch.matmul(torch.ones(batch_size, 1).to(device_name).t(), dLdY).view(-1) dLdX = torch.matmul(dLdY, A) rank = mpu.get_tensor_model_parallel_rank() @@ -267,10 +267,10 @@ def test_row_parallel_linear(tensor_model_parallel_size): batch_size = 7 # Network - identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + identity_layer = IdentityLayer2D(batch_size, input_size).to(device_name) linear_layer = mpu.RowParallelLinear( - input_size, output_size, keep_master_weight_for_test=True).cuda() - loss_weight = torch.randn([batch_size, output_size]).cuda() + input_size, output_size, keep_master_weight_for_test=True).to(device_name) + loss_weight = torch.randn([batch_size, output_size]).to(device_name) # Forward input_ = identity_layer() output = linear_layer(input_) @@ -281,9 +281,9 @@ def test_row_parallel_linear(tensor_model_parallel_size): # Values. 
dLdY = loss_weight X = identity_layer.weight - A = linear_layer.master_weight.cuda() + A = linear_layer.master_weight.to(device_name) dLdA = torch.matmul(dLdY.t(), X) - dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdb = torch.matmul(torch.ones(batch_size, 1).to(device_name).t(), dLdY).view(-1) dLdX = torch.matmul(dLdY, A) rank = mpu.get_tensor_model_parallel_rank() @@ -340,11 +340,11 @@ def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partit # Network identity_layer = IdentityLayer3D(batch_size, sequence_length, - hidden_size).cuda() + hidden_size).to(device_name) attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, - dropout_prob).cuda() - loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() - attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + dropout_prob).to(device_name) + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).to(device_name) + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).to(device_name) # Forward input_ = identity_layer() output = attention_layer(input_, attention_mask) @@ -426,13 +426,13 @@ def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition # Network identity_layer = IdentityLayer3D(batch_size, sequence_length, - hidden_size).cuda() + hidden_size).to(device_name) transformer_layer = mpu.BertParallelTransformerLayer( hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, - torch.nn.functional.relu, 1.0e-5).cuda() + torch.nn.functional.relu, 1.0e-5).to(device_name) - loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() - attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).to(device_name) + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).to(device_name) # Forward input_ = identity_layer() output = transformer_layer(input_, attention_mask) diff --git a/megatron/mpu/tests/test_random.py b/megatron/mpu/tests/test_random.py index 9c9c50341..92ec14d2e 100644 --- a/megatron/mpu/tests/test_random.py +++ b/megatron/mpu/tests/test_random.py @@ -32,11 +32,11 @@ def test_set_cuda_rng_state(tensor_model_parallel_size): size = 123 seed = 1234 - torch.cuda.manual_seed(1234) - tensor = torch.cuda.FloatTensor(size) + get_accelerator().manual_seed(1234) + tensor = get_accelerator().FloatTensor(size) # Get the state - rng_state = torch.cuda.get_rng_state() + rng_state = get_accelerator().get_rng_state() rng_state_copy = rng_state.clone() # Do some stuff. @@ -45,10 +45,10 @@ def test_set_cuda_rng_state(tensor_model_parallel_size): result_1 = tensor.clone() assert rng_state.sub(rng_state_copy).max() == 0 - assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + assert get_accelerator().get_rng_state().sub(rng_state_copy).max() > 0 # State should be different. - new_rng_state = torch.cuda.get_rng_state() + new_rng_state = get_accelerator().get_rng_state() max_diff = new_rng_state.sub(rng_state).max() print(' max diff in rng state (should be non-zero) on global rank {}: {}'. format(torch.distributed.get_rank(), max_diff)) @@ -95,17 +95,17 @@ def test_cuda_rng_tracker(tensor_model_parallel_size): seed_1 = 1234 seed_2 = 4321 size = [12, 21] - tensor = torch.cuda.FloatTensor(size) + tensor = get_accelerator().FloatTensor(size) # Set to seed_1 and generate two tensors. 
- torch.cuda.manual_seed(seed_1) + get_accelerator().manual_seed(seed_1) torch.randn(size, out=tensor) target_11 = tensor.clone() torch.randn(size, out=tensor) target_12 = tensor.clone() # Set to seed_2 and generate two tensors. - torch.cuda.manual_seed(seed_2) + get_accelerator().manual_seed(seed_2) torch.randn(size, out=tensor) target_21 = tensor.clone() torch.randn(size, out=tensor) @@ -113,7 +113,7 @@ def test_cuda_rng_tracker(tensor_model_parallel_size): # Now if we interleave seed_1 and seed_2, # we should still get the same tensors - torch.cuda.manual_seed(seed_1) + get_accelerator().manual_seed(seed_1) mpu.get_cuda_rng_tracker().add('test', seed_2) torch.randn(size, out=tensor) @@ -164,9 +164,9 @@ def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size): tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() mpu.model_parallel_cuda_manual_seed(12345) - assert torch.cuda.initial_seed() == 12345 + assert get_accelerator().initial_seed() == 12345 with mpu.get_cuda_rng_tracker().fork(): - assert torch.cuda.initial_seed() == (12345 + 2718 + + assert get_accelerator().initial_seed() == (12345 + 2718 + mpu.get_tensor_model_parallel_rank()) # Reset the tracker diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 68307423c..659d680be 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -12,9 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from deepspeed.accelerator import get_accelerator +if get_accelerator().device_name() == 'cuda': + from apex.optimizers import FusedAdam as Adam + from apex.optimizers import FusedSGD as SGD +else: + from torch.optim import Adam + from torch.optim import SGD -from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD from megatron import get_args from megatron.model import LayerNorm diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index 036a1d4c4..1f26f3868 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -17,14 +17,17 @@ import torch from torch._six import inf +from deepspeed.accelerator import get_accelerator + +if get_accelerator().device_name() == 'cuda': + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C from megatron import mpu from megatron.model.module import param_is_not_shared from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate - +from deepspeed.accelerator import get_accelerator def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): """Clips gradient norm of an iterable of parameters whose gradients @@ -61,7 +64,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): grad = param.grad.detach() if grad_not_none: # Make sure the grads are in fp32 - assert param.grad.type() == 'torch.cuda.FloatTensor' + assert param.grad.type() == 'torch.{}.FloatTensor'.format(get_accelerator().device_name()) grads.append(grad) if grad_not_none and is_not_shared and is_not_tp_duplicate: grads_for_norm.append(grad) @@ -74,7 +77,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Calculate norm. 
if norm_type == inf: total_norm = max(grad.abs().max() for grad in grads_for_norm) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all model-parallel GPUs. torch.distributed.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.MAX, @@ -83,20 +86,22 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): else: if norm_type == 2.0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - # Use apex's multi-tensor applier for efficiency reasons. - # Multi-tensor applier takes a function and a list of list - # and performs the operation on that list all in one kernel. - grad_norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - dummy_overflow_buf, - [grads_for_norm], - False # no per-parameter norm - ) + if get_accelerator().device_name() == 'cuda': + dummy_overflow_buf = get_accelerator().IntTensor([0]) + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. + grad_norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [grads_for_norm], + False # no per-parameter norm + ) + else: + grad_norm = torch.norm(grads_for_norm,p=2.0) # Since we will be summing across data parallel groups, # we need the pow(norm-type). total_norm = grad_norm ** norm_type - else: for grad in grads_for_norm: grad_norm = torch.norm(grad, norm_type) @@ -111,11 +116,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Scale. clip_coeff = max_norm / (total_norm + 1.0e-6) if clip_coeff < 1.0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - dummy_overflow_buf, - [grads, grads], - clip_coeff) + if get_accelerator().device_name() == 'cuda': + dummy_overflow_buf = get_accelerator().IntTensor([0]) + multi_tensor_applier(amp_C.multi_tensor_scale, + dummy_overflow_buf, + [grads, grads], + clip_coeff) + else: + for g in grads: + g.detach().mul_(clip_coeff.to(g.device)) return total_norm diff --git a/megatron/optimizer/grad_scaler.py b/megatron/optimizer/grad_scaler.py index 6b25588db..bc2897f86 100644 --- a/megatron/optimizer/grad_scaler.py +++ b/megatron/optimizer/grad_scaler.py @@ -19,14 +19,14 @@ from abc import abstractmethod import torch - +from deepspeed.accelerator import get_accelerator class MegatronGradScaler(ABC): def __init__(self, initial_scale): """Initialize scale value with the input initial scale.""" assert initial_scale > 0.0 - self._scale = torch.cuda.FloatTensor([initial_scale]) + self._scale = get_accelerator().FloatTensor([initial_scale]) @property def scale(self): @@ -75,13 +75,13 @@ def __init__(self, initial_scale, min_scale, # Lower bound on the scale. assert min_scale > 0.0 assert min_scale <= initial_scale - self.min_scale = torch.cuda.FloatTensor([min_scale]) + self.min_scale = get_accelerator().FloatTensor([min_scale]) # Growth and backoff factors for the scale. assert growth_factor > 1.0 - self.growth_factor = torch.cuda.FloatTensor([growth_factor]) + self.growth_factor = get_accelerator().FloatTensor([growth_factor]) assert backoff_factor < 1.0 assert backoff_factor > 0.0 - self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) + self.backoff_factor = get_accelerator().FloatTensor([backoff_factor]) # Interval over which if we don't see any inf/nan, # we will scale the grad scale by the growth factor. 
assert growth_interval > 0 @@ -128,6 +128,6 @@ def state_dict(self): def load_state_dict(self, state_dict): - self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) + self._scale = state_dict['scale'].to(get_accelerator().current_device_name()) self._growth_tracker = state_dict['growth_tracker'] self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 77baddd62..6568bf10a 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -20,13 +20,11 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C from megatron import get_timers from megatron import mpu from megatron import print_rank_0 - +from deepspeed.accelerator import get_accelerator from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32 @@ -50,7 +48,10 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): We don't have a blfoat16 implementation so for now if the overflow_buf is not provided, we default back to simple loop copy to be compatible with bfloat16.""" - if overflow_buf: + if get_accelerator().device_name() == 'cuda' and overflow_buf: + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C + overflow_buf.fill_(0) # Scaling with factor `1.0` is equivalent to copy. multi_tensor_applier(amp_C.multi_tensor_scale, @@ -204,7 +205,7 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, # Note that we keep this for the cases that grad scaler is none. # We still record nan/inf if we have a bfloat16 with a grad scaler. if self.grad_scaler: - self.found_inf = torch.cuda.FloatTensor([0.0]) + self.found_inf = get_accelerator().FloatTensor([0.0]) # Dummy tensor needed for apex multi-apply tensor. # For bfloat, we don't have multi-tensor apply and for now @@ -212,11 +213,11 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, if bf16: self._dummy_overflow_buf = None else: - self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + self._dummy_overflow_buf = get_accelerator().IntTensor([0]) # In case grad scaler is not passed, define the unity scale. if self.grad_scaler is None: - self._scale_one = torch.cuda.FloatTensor([1.0]) + self._scale_one = get_accelerator().FloatTensor([1.0]) # ====================== # main parameter stuff @@ -240,8 +241,10 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, if param.requires_grad: # float16 params: - if param.type() in ['torch.cuda.HalfTensor', - 'torch.cuda.BFloat16Tensor']: + + + if param.type() in ['torch.{}.HalfTensor'.format(get_accelerator().device_name()), + 'torch.{}.BFloat16Tensor'.format(get_accelerator().device_name())]: float16_params_this_group.append(param) # Create a copy main_param = param.detach().clone().float() @@ -259,16 +262,17 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, = self.optimizer.state.pop(param) # fp32 params. - elif param.type() == 'torch.cuda.FloatTensor': + elif param.type() == 'torch.{}.FloatTensor'.format(format(get_accelerator().device_name())): fp32_params_this_group.append(param) param_group['params'][i] = param else: + device_name = get_accelerator().device_name() raise TypeError('Wrapped parameters must be one of ' - 'torch.cuda.FloatTensor, ' - 'torch.cuda.HalfTensor, or ' - 'torch.cuda.BFloat16Tensor. ' - 'Received {}'.format(param.type())) + 'torch.{}.FloatTensor, ' + 'torch.{}.HalfTensor, or ' + 'torch.{}.BFloat16Tensor. 
' + 'Received {}'.format(device_name,device_name,device_name,param.type())) self.float16_groups.append(float16_params_this_group) self.fp32_from_float16_groups.append( @@ -470,7 +474,7 @@ def __init__(self, optimizer, clip_grad, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad) - self._scale = torch.cuda.FloatTensor([1.0]) + self._scale = get_accelerator().FloatTensor([1.0]) def zero_grad(self, set_to_none=True): diff --git a/megatron/p2p_communication.py b/megatron/p2p_communication.py index 863a60b0a..21df8b2b3 100644 --- a/megatron/p2p_communication.py +++ b/megatron/p2p_communication.py @@ -16,7 +16,7 @@ from functools import reduce import operator import torch - +from deepspeed.accelerator import get_accelerator from megatron import get_args from megatron import mpu @@ -59,12 +59,12 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next, if recv_prev: tensor_recv_prev = torch.empty(tensor_chunk_shape, requires_grad=True, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=dtype) if recv_next: tensor_recv_next = torch.empty(tensor_chunk_shape, requires_grad=True, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=dtype) # Split tensor into smaller chunks if using scatter-gather optimization. @@ -109,7 +109,7 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next, for req in reqs: req.wait() # To protect against race condition when using batch_isend_irecv(). - torch.cuda.synchronize() + get_accelerator().synchronize() # If using scatter-gather optimization, gather smaller chunks. if args.scatter_gather_tensors_in_pipeline: diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index adf04bcb4..95c013e44 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -22,7 +22,6 @@ import torch import torch.nn.functional as F - from megatron import get_args from megatron import get_tokenizer from megatron import mpu @@ -33,14 +32,14 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import Float16Module - +from deepspeed.accelerator import get_accelerator def get_batch(context_tokens): """Generate batch from context tokens.""" args = get_args() tokenizer = get_tokenizer() # Move to GPU. - tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda() + tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().to(get_accelerator().device_name()) # Get the attention mask and postition ids. 
attention_mask, _, position_ids = get_ltor_masks_and_position_ids( tokens, @@ -137,7 +136,7 @@ def generate_samples_input_from_file(model): context_length = 0 input_info = [terminate_runs, raw_text_len, context_length] - input_info_tensor = torch.cuda.LongTensor(input_info) + input_info_tensor = get_accelerator().LongTensor(input_info) torch.distributed.all_reduce(input_info_tensor, group=mpu.get_model_parallel_group()) terminate_runs = input_info_tensor[0].item() @@ -154,14 +153,14 @@ def generate_samples_input_from_file(model): if mpu.is_pipeline_first_stage(): src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_tokens_tensor = get_accelerator().LongTensor(context_tokens) torch.distributed.broadcast(context_tokens_tensor, src, group) else: src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() context_tokens_tensor = torch.empty(context_length, dtype=torch.int64, - device=torch.device("cuda")) + device=get_accelerator().current_device_name()) torch.distributed.broadcast(context_tokens_tensor, src, group) context_tokens = context_tokens_tensor.cpu().numpy().tolist() @@ -259,7 +258,7 @@ def generate_samples_interactive(model, print_frequency=24): context_length = 0 input_info = [terminate_runs, raw_text_len, context_length] - input_info_tensor = torch.cuda.LongTensor(input_info) + input_info_tensor = get_accelerator().LongTensor(input_info) torch.distributed.all_reduce(input_info_tensor, group=mpu.get_model_parallel_group()) terminate_runs = input_info_tensor[0].item() @@ -276,14 +275,14 @@ def generate_samples_interactive(model, print_frequency=24): if mpu.is_pipeline_first_stage(): src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_tokens_tensor = get_accelerator().LongTensor(context_tokens) torch.distributed.broadcast(context_tokens_tensor, src, group) else: src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() context_tokens_tensor = torch.empty(context_length, dtype=torch.int64, - device=torch.device("cuda")) + device=torch.device(get_accelerator().device_name())) torch.distributed.broadcast(context_tokens_tensor, src, group) context_tokens = context_tokens_tensor.cpu().numpy().tolist() @@ -333,12 +332,12 @@ def generate_samples_unconditional(model, latencies=[], model_latencies=[], sing for _ in range(args.micro_batch_size)] ctr = 0 while True: - torch.cuda.synchronize() + get_accelerator().synchronize() start_time = time.time() for token_stream in get_token_stream(model, copy.deepcopy(context_tokens), model_latencies=model_latencies, single_token_latency=single_token_latency): pass - torch.cuda.synchronize() + get_accelerator().synchronize() latencies.append(time.time() - start_time) start_time = time.time() if mpu.is_pipeline_last_stage() and \ @@ -400,8 +399,8 @@ def get_token_stream(model, context_tokens, model_latencies=[], single_token_lat context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eod, args) - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) - context_length_tensor = torch.cuda.LongTensor(context_lengths) + context_tokens_tensor = get_accelerator().LongTensor(context_tokens) + context_length_tensor = get_accelerator().LongTensor(context_lengths) torch.distributed.broadcast(context_length_tensor, 
mpu.get_tensor_model_parallel_src_rank(), @@ -422,10 +421,10 @@ def get_token_stream(model, context_tokens, model_latencies=[], single_token_lat t0=time.time() for tokens, lengths in batch_token_iterator: if count > 1: - torch.cuda.synchronize() + get_accelerator().synchronize() t_elapsed = time.time() - t0 single_token_latency.append(t_elapsed) - torch.cuda.synchronize() + get_accelerator().synchronize() t0=time.time() count+=1 context_length += 1 @@ -447,7 +446,7 @@ def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids, # Hidden size changes when not using recompute, need to tell p2p_communicate # functions the correct size - torch.cuda.synchronize() + get_accelerator().synchronize() t0 = time.time() args = get_args() orig_seq_length = args.seq_length @@ -476,7 +475,7 @@ def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids, send_forward(output_tensor) args.seq_length = orig_seq_length - torch.cuda.synchronize() + get_accelerator().synchronize() model_latencies.append(time.time()-t0) if get_key_value: return output_tensor, layer_past @@ -506,14 +505,14 @@ def sample_sequence_batch(model, context_tokens, context_lengths, layer_past = None batch_size = context_tokens.size(0) - is_done = torch.zeros([batch_size]).byte().cuda() + is_done = torch.zeros([batch_size]).byte().to(get_accelerator().device_name()) tokens = context_tokens if maxlen is None: maxlen = args.seq_length - 1 if maxlen > (org_context_length + args.out_seq_length): maxlen = org_context_length + args.out_seq_length - lengths = torch.ones([batch_size]).long().cuda() * maxlen + lengths = torch.ones([batch_size]).long().to(get_accelerator().device_name()) * maxlen while context_length <= (maxlen): if args.recompute: @@ -593,7 +592,7 @@ def sample_sequence_batch(model, context_tokens, context_lengths, else: yield None, None - done = torch.cuda.ByteTensor([0]) + done = get_accelerator().ByteTensor([0]) src = mpu.get_pipeline_model_parallel_last_rank() group = mpu.get_pipeline_model_parallel_group() torch.distributed.broadcast(done, src, group) diff --git a/megatron/training.py b/megatron/training.py index 92d4ba2a8..adf84699f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -52,7 +52,7 @@ from megatron.schedules import forward_backward_pipelining_without_interleaving from megatron.schedules import forward_backward_pipelining_with_interleaving from megatron.utils import report_memory, throughput_calculator, checkpoint_throughput_calculator - +from deepspeed.accelerator import get_accelerator import deepspeed from deepspeed.compression.compress import init_compression, redundancy_clean @@ -105,7 +105,7 @@ def pretrain(train_valid_test_dataset_provider, # This will be closer to what scheduler will see (outside of # image ... launches. global _TRAIN_START_TIME - start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME]) + start_time_tensor = get_accelerator().FloatTensor([_TRAIN_START_TIME]) torch.distributed.all_reduce(start_time_tensor, op=torch.distributed.ReduceOp.MIN) _TRAIN_START_TIME = start_time_tensor.item() @@ -326,14 +326,15 @@ def get_model(model_provider_func): # GPU allocation. for model_module in model: - model_module.cuda(torch.cuda.current_device()) + model_module.to(get_accelerator().current_device_name()) + # Fp16 conversion. 
if args.fp16 or args.bf16: model = [Float16Module(model_module, args) for model_module in model] if args.DDP_impl == 'torch': - i = torch.cuda.current_device() + i = get_accelerator().current_device() model = [torchDDP(model_module, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) for model_module in model] @@ -712,7 +713,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, for key in loss_dict: if not skipped_iter: total_loss_dict[key] = total_loss_dict.get( - key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] + key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] else: value = loss_dict[key].float().sum().item() is_nan = value == float('inf') or \ @@ -848,23 +849,23 @@ def add_to_logging(name): # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if args.zero_stage > 0: # ZeRO partiions optimizer states - opt_stats = torch.cuda.FloatTensor(opt_stats) + opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_data_parallel_group()) - opt_stats_2 = torch.cuda.FloatTensor(opt_stats_2) + opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_data_parallel_group()) if args.tensor_model_parallel_size > 1: - opt_stats = torch.cuda.FloatTensor(opt_stats) + opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_tensor_model_parallel_group()) - opt_stats_2 = torch.cuda.FloatTensor(opt_stats_2) + opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_tensor_model_parallel_group()) if args.pipeline_model_parallel_size > 1: - opt_stats = torch.cuda.FloatTensor(opt_stats) + opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_pipeline_model_parallel_group()) - opt_stats_2 = torch.cuda.FloatTensor(opt_stats_2) + opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_pipeline_model_parallel_group()) @@ -939,7 +940,7 @@ def add_to_logging(name): float(max(1, total_loss_dict[advanced_iters_key])) if avg > 0.0: log_string += ' {}: {:.6E} |'.format(key, avg) - total_loss_dict[key] = torch.cuda.FloatTensor([0.0]) + total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) log_string += ' loss scale: {:.1f} |'.format(loss_scale) if grad_norm is not None: log_string += ' grad norm: {:.3f} |'.format(grad_norm) @@ -1103,7 +1104,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Exiting based on duration if args.exit_duration_in_mins: train_time = (time.time() - _TRAIN_START_TIME) / 60.0 - done_cuda = torch.cuda.IntTensor( + done_cuda = get_accelerator().IntTensor( [train_time > args.exit_duration_in_mins]) torch.distributed.all_reduce( done_cuda, op=torch.distributed.ReduceOp.MAX) @@ -1180,7 +1181,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): for key in loss_dict: if 'moe' not in key: total_loss_dict[key] = total_loss_dict.get( - key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] + key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] args.consumed_valid_samples += mpu.get_data_parallel_world_size() \ * args.micro_batch_size \ @@ -1299,10 +1300,10 @@ def build_train_valid_test_data_iterators( 
do_valid = valid_dataloader is not None and args.eval_iters > 0 do_test = test_dataloader is not None and args.eval_iters > 0 # Need to broadcast num_tokens and num_type_tokens. - flags = torch.cuda.LongTensor( + flags = get_accelerator().LongTensor( [int(do_train), int(do_valid), int(do_test)]) else: - flags = torch.cuda.LongTensor([0, 0, 0]) + flags = get_accelerator().LongTensor([0, 0, 0]) # Broadcast num tokens. torch.distributed.broadcast(flags, diff --git a/megatron/utils.py b/megatron/utils.py index 99c9438bc..6657a289d 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -20,8 +20,10 @@ import torch from torch.nn.parallel import DistributedDataParallel as torchDDP -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C +from deepspeed.accelerator import get_accelerator +if get_accelerator().device_name() == 'cuda': + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C from megatron import get_args from megatron import print_rank_0 @@ -30,7 +32,7 @@ from megatron.model.module import param_is_not_shared from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate from megatron import get_num_microbatches - +from deepspeed.accelerator import get_accelerator def unwrap_model(model, module_instances=(torchDDP)): return_list = True if not isinstance(model, list): @@ -63,13 +65,18 @@ def calc_params_l2_norm(model): else: params_data.append(param.data) # Calculate norm - dummy_overflow_buf = torch.cuda.IntTensor([0]) - norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - dummy_overflow_buf, - [params_data], - False # no per-parameter norm - ) + dummy_overflow_buf = get_accelerator().IntTensor([0]) + + if get_accelerator().device_name() == 'cuda': + + norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [params_data], + False # no per-parameter norm + ) + else: + norm = torch.norm(torch.stack([torch.norm(p.float(), 2.0) for p in params_data]), 2.0) norm_2 = norm * norm # Sum across all model-parallel GPUs. torch.distributed.all_reduce(norm_2, @@ -95,13 +102,13 @@ def report_memory(name): mega_bytes = 1024.0 * 1024.0 string = name + ' memory (MB)' string += ' | allocated: {}'.format( - torch.cuda.memory_allocated() / mega_bytes) + get_accelerator().memory_allocated() / mega_bytes) string += ' | max allocated: {}'.format( - torch.cuda.max_memory_allocated() / mega_bytes) + get_accelerator().max_memory_allocated() / mega_bytes) string += ' | reserved: {}'.format( - torch.cuda.memory_reserved() / mega_bytes) + get_accelerator().memory_reserved() / mega_bytes) string += ' | max reserved: {}'.format( - torch.cuda.max_memory_reserved() / mega_bytes) + get_accelerator().max_memory_reserved() / mega_bytes) if mpu.get_data_parallel_rank() == 0: print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 369152c8d..355c41450 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -31,6 +31,7 @@ import deepspeed from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator import os import subprocess @@ -62,7 +63,7 @@ def model_provider(pre_process=True, post_process=True): # pipeline it as an activation during training. The mask is constant, and thus # we can reuse it.
attention_mask = torch.tril(torch.ones( - (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view( + (1, args.seq_length, args.seq_length), device=get_accelerator().current_device_name())).view( 1, 1, args.seq_length, args.seq_length) # Convert attention mask to binary: diff --git a/pretrain_ict.py b/pretrain_ict.py index 1438b3d57..5ed02356a 100644 --- a/pretrain_ict.py +++ b/pretrain_ict.py @@ -29,7 +29,7 @@ from megatron.model.biencoder_model import biencoder_model_provider from megatron.training import pretrain from megatron.utils import average_losses_across_data_parallel_group - +from deepspeed.accelerator import get_accelerator def pretrain_ict_model_provider(): args = get_args() @@ -89,8 +89,8 @@ def forward_step(data_iterator, model, input_tensor): timers('batch-generator').stop() # Query and Context Types - query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) - context_types = torch.cuda.LongTensor(*context_tokens.shape).fill_(0) + query_types = get_accelerator().LongTensor(*query_tokens.shape).fill_(0) + context_types = get_accelerator().LongTensor(*context_tokens.shape).fill_(0) # Forward model. query_logits, context_logits = model(query_tokens, query_mask, @@ -118,12 +118,12 @@ def forward_step(data_iterator, model, input_tensor): k=softmax_scores.shape[1], sorted=True) def topk_accuracy(k): - return torch.cuda.FloatTensor([sum([int(i in sorted_indices[i, :k]) \ + return get_accelerator().FloatTensor([sum([int(i in sorted_indices[i, :k]) \ for i in range(global_batch_size)]) / global_batch_size]) topk_accs = [topk_accuracy(int(k)) for k in args.retriever_report_topk_accuracies] - labels = torch.arange(global_batch_size).long().cuda() + labels = torch.arange(global_batch_size).long().to(get_accelerator().device_name()) loss = F.nll_loss(softmax_scores, labels, reduction='mean') reduced_losses = average_losses_across_data_parallel_group([loss, *topk_accs]) diff --git a/pretrain_vit.py b/pretrain_vit.py index 16ec10439..f086d18e1 100644 --- a/pretrain_vit.py +++ b/pretrain_vit.py @@ -37,8 +37,8 @@ def get_batch(data_iterator): data = next(data_iterator) # only data parallelism; no need for broadcast - images = data[0].cuda() - labels = data[1].cuda() + images = data[0].to(get_accelerator().device_name()) + labels = data[1].to(get_accelerator().device_name()) return images, labels diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index ca28e9ee3..394261e3f 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -55,7 +55,7 @@ def __init__(self, model, tokenizer): self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 - self._device = torch.cuda.current_device() + self._device = get_accelerator().current_device_name() self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index 7549f4a09..6c15732c6 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -27,6 +27,7 @@ from megatron.schedules import get_forward_backward_func from tasks.finetune_utils import build_data_loader from tasks.finetune_utils import process_batch +from deepspeed.accelerator import get_accelerator def accuracy_func_provider(single_dataset_provider): @@ -172,7 +173,7 @@ def correct_answers_forward_step(batch, model): # Reduce. 
if mpu.is_pipeline_last_stage(): - unreduced = torch.cuda.LongTensor([correct, total]) + unreduced = get_accelerator().LongTensor([correct, total]) torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index df9210287..fd4e79a98 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -32,16 +32,16 @@ from megatron.utils import average_losses_across_data_parallel_group from megatron.utils import calc_params_l2_norm from megatron.utils import check_adlr_autoresume_termination - +from deepspeed.accelerator import get_accelerator def process_batch(batch): """Process batch and produce inputs for the model.""" args = get_args() - tokens = batch['text'].long().cuda().contiguous() - types = batch['types'].long().cuda().contiguous() - labels = batch['label'].long().cuda().contiguous() - attention_mask = batch['padding_mask'].float().cuda().contiguous() + tokens = batch['text'].long().to(get_accelerator().device_name()).contiguous() + types = batch['types'].long().to(get_accelerator().device_name()).contiguous() + labels = batch['label'].long().to(get_accelerator().device_name()).contiguous() + attention_mask = batch['padding_mask'].float().to(get_accelerator().device_name()).contiguous() if args.fp16: attention_mask = attention_mask.half() diff --git a/tasks/orqa/evaluate_utils.py b/tasks/orqa/evaluate_utils.py index ebee03522..986ae1b93 100644 --- a/tasks/orqa/evaluate_utils.py +++ b/tasks/orqa/evaluate_utils.py @@ -25,7 +25,7 @@ from megatron.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex from megatron.model.biencoder_model import biencoder_model_provider from megatron.training import get_model - +from deepspeed.accelerator import get_accelerator class ORQAEvaluator(object): def __init__(self): args = get_args() @@ -121,7 +121,7 @@ def evaluate(self, qa_data, split): split) local_rank = args.local_rank rank = torch.distributed.get_rank() - device_count = torch.cuda.device_count() + device_count = get_accelerator().device_count() num_nodes = torch.distributed.get_world_size() // device_count node_id = rank // device_count @@ -145,14 +145,14 @@ def evaluate(self, qa_data, split): distance, topkindex = self.mips_index.search_mips_index( all_query_tensor, top_k=args.faiss_topk_retrievals, reconstruct=False) - distance = torch.from_numpy(distance).cuda() - topkindex = torch.LongTensor(topkindex).cuda() + distance = torch.from_numpy(distance).to(get_accelerator().device_name()) + topkindex = torch.LongTensor(topkindex).to(get_accelerator().device_name()) if local_rank != 0: distance = torch.empty(device_count * len(query_tensor), \ - args.faiss_topk_retrievals, dtype=torch.float32).cuda() + args.faiss_topk_retrievals, dtype=torch.float32).to(get_accelerator().device_name()) topkindex = torch.empty(device_count * len(query_tensor), \ - args.faiss_topk_retrievals, dtype=torch.int64).cuda() + args.faiss_topk_retrievals, dtype=torch.int64).to(get_accelerator().device_name()) torch.distributed.broadcast(distance, src=device_start_rank, \ group=group) diff --git a/tasks/orqa/natural_questions/nq.py b/tasks/orqa/natural_questions/nq.py index ca07fe416..5282887ee 100644 --- a/tasks/orqa/natural_questions/nq.py +++ b/tasks/orqa/natural_questions/nq.py @@ -28,6 +28,7 @@ from megatron import print_rank_0, get_args, get_tokenizer, mpu from megatron.data.biencoder_dataset_utils import make_attention_mask +from deepspeed.accelerator import get_accelerator def get_nq_dataset(qa_data, split): args = 
get_args() @@ -42,10 +43,10 @@ def get_nq_dataset(qa_data, split): def process_nq_batch(batch): - query_tokens = batch['token_ids'].long().cuda() - query_mask = (batch['token_mask'] < 0.5).cuda() - query_types = batch['token_types'].long().cuda() - query_len = batch['seq_len'].long().cuda() + query_tokens = batch['token_ids'].long().to(get_accelerator().device_name()) + query_mask = (batch['token_mask'] < 0.5).to(get_accelerator().device_name()) + query_types = batch['token_types'].long().to(get_accelerator().device_name()) + query_len = batch['seq_len'].long().to(get_accelerator().device_name()) reference = batch['reference'] return query_tokens, query_mask, query_types, query_len, reference diff --git a/tasks/vision/eval_utils.py b/tasks/vision/eval_utils.py index aabc04a15..37024a709 100644 --- a/tasks/vision/eval_utils.py +++ b/tasks/vision/eval_utils.py @@ -23,7 +23,7 @@ from tasks.vision.finetune_utils import build_data_loader from tasks.vision.finetune_utils import process_batch from torchvision import datasets, transforms - +from deepspeed.accelerator import get_accelerator def accuracy_func_provider(): """Provide function that calculates accuracies.""" @@ -86,7 +86,7 @@ def calculate_correct_answers(model, dataloader, epoch): model.train() # Reduce. - unreduced = torch.cuda.LongTensor([correct, total]) + unreduced = get_accelerator().LongTensor([correct, total]) torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) # Print on screen. diff --git a/tasks/vision/finetune_utils.py b/tasks/vision/finetune_utils.py index afde4aa89..47744ef1d 100644 --- a/tasks/vision/finetune_utils.py +++ b/tasks/vision/finetune_utils.py @@ -29,12 +29,12 @@ from megatron.training import training_log from megatron.utils import check_adlr_autoresume_termination from megatron.utils import average_losses_across_data_parallel_group - +from deepspeed.accelerator import get_accelerator def process_batch(batch): """Process batch and produce inputs for the model.""" - images = batch[0].cuda().contiguous() - labels = batch[1].cuda().contiguous() + images = batch[0].to(get_accelerator().device_name()).contiguous() + labels = batch[1].to(get_accelerator().device_name()).contiguous() return images, labels diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py index 6366cfb60..4e1e763ce 100644 --- a/tasks/zeroshot_gpt/evaluate.py +++ b/tasks/zeroshot_gpt/evaluate.py @@ -29,7 +29,7 @@ from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward from tasks.finetune_utils import build_data_loader - +from deepspeed.accelerator import get_accelerator from .datasets import build_dataset # These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? 
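The change pattern across these files is mechanical: every hard-coded torch.cuda call or .cuda() placement is routed through DeepSpeed's accelerator abstraction so the same code can run on non-CUDA devices. A toy illustration of the pattern (not code from this patch):

import torch
from deepspeed.accelerator import get_accelerator

device = get_accelerator().device_name()          # e.g. 'cuda' on NVIDIA GPUs
tokens = torch.randint(0, 100, (4, 16)).long().to(device).contiguous()
flags = get_accelerator().LongTensor([1, 0, 1])   # replaces torch.cuda.LongTensor
allocated = get_accelerator().memory_allocated()  # replaces torch.cuda.memory_allocated()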
@@ -66,8 +66,8 @@ def process_batch(batch): args = get_args() tokenizer = get_tokenizer() - loss_mask = batch['pad_mask'].long().cuda().contiguous().byte() - tokens_ = batch['text'].long().cuda().contiguous() + loss_mask = batch['pad_mask'].long().to(get_accelerator().device_name()).contiguous().byte() + tokens_ = batch['text'].long().to(get_accelerator().device_name()).contiguous() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() From 57e64391037e7ffd9005a87ae7f1d3009bfddaa7 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 21 Feb 2023 10:38:09 -0800 Subject: [PATCH 11/11] data efficiency example update (#113) --- examples/data_efficiency/README.md | 2 +- .../ds_config_bert_TEMPLATE.json | 24 +++ .../finetune_glue/ds_finetune_bert_glue.sh | 156 ++++++++++++++ .../ds_finetune_bert_glue_run.sh | 44 ++++ .../ds_finetune_gather_result.py | 118 +++++++++++ .../pretrain/ds_pretrain_bert_336M_run.sh | 190 ++++++++++++++---- tasks/eval_harness/evaluate.py | 3 +- tasks/eval_utils.py | 110 +++++++--- tasks/finetune_utils.py | 42 ++++ tasks/glue/cola.py | 90 +++++++++ tasks/glue/finetune.py | 59 +++++- tasks/glue/mrpc.py | 101 ++++++++++ tasks/glue/qnli.py | 101 ++++++++++ tasks/glue/rte.py | 101 ++++++++++ tasks/glue/sst2.py | 95 +++++++++ tasks/glue/stsb.py | 100 +++++++++ tasks/main.py | 2 +- 17 files changed, 1275 insertions(+), 63 deletions(-) create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py create mode 100644 tasks/glue/cola.py create mode 100644 tasks/glue/mrpc.py create mode 100644 tasks/glue/qnli.py create mode 100644 tasks/glue/rte.py create mode 100644 tasks/glue/sst2.py create mode 100644 tasks/glue/stsb.py diff --git a/examples/data_efficiency/README.md b/examples/data_efficiency/README.md index e10db17d5..7ed96ae72 100644 --- a/examples/data_efficiency/README.md +++ b/examples/data_efficiency/README.md @@ -20,4 +20,4 @@ The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are u ``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality. -``bert/finetune`` includes the finetuning example scripts. \ No newline at end of file +``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable and the Megatron-LM paper used a very long number of epochs for MNLI/QQP which is not necessary. Thus we added capability of finetuning other GLUE tasks, and switched to follow the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend to use instead of ``bert/finetune``. 
Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning. \ No newline at end of file diff --git a/examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json b/examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json new file mode 100644 index 000000000..2700805d1 --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json @@ -0,0 +1,24 @@ +{ + "train_batch_size" : CONFIG_BATCH_SIZE, + "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false +} diff --git a/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh new file mode 100644 index 000000000..0e0c571a4 --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh @@ -0,0 +1,156 @@ +hostname_and_rank=$1 +master_port=$2 +seed=$3 +task=$4 +lr=$5 +pretrained_checkpoint=$6 + +# hostname_and_rank="worker-0:0,1,2,3" +# master_port=12345 +# seed=1234 +# task="MNLI" +# lr=2e-5 +# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +seq_len=512 + +global_batch_size=32 +epochs=3 + +train_data="/blob/data/GlueData/${task}/train.tsv" +valid_data="/blob/data/GlueData/${task}/dev.tsv" +if [[ "${task}" = "MNLI" ]]; then +valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \ + /blob/data/GlueData/MNLI/dev_mismatched.tsv" +fi + +## Adjust based on number of GPUs. +batch_size=8 + +## BERT 110M (BERT-Base) +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 + +## BERT 336M (BERT-Large) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +# output_path="${pretrained_checkpoint}-finetune-glue-4v100/${jobname}" +output_path=$(basename "$pretrained_checkpoint") +output_path="glue-results/${output_path}-finetune-glue-4v100/${jobname}" +mkdir -p ${output_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.1 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev set accuracy numbers by +# "grep -e "overall:" -e "metrics for" ${output_path}/output.log" +deepspeed --include=${hostname_and_rank} --master_port=${master_port} ../../../../tasks/main.py ${options} &> ${output_path}/output.log diff --git a/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh new file mode 100644 index 000000000..10e04f2c7 --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh @@ -0,0 +1,44 @@ +hostname_and_rank=$1 +master_port=$2 +pretrained_checkpoint=$3 + +# hostname_and_rank="worker-0:0,1,2,3" +# master_port=12345 +# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +tasks=( + RTE + MRPC + STS-B + CoLA + SST-2 + QNLI + QQP + MNLI +) + +seeds=( + 1234 + 1235 + 1236 + 1237 + 1238 +) + +lrs=( + 2e-5 + 3e-5 + 4e-5 + 5e-5 +) + +for 
((i=0;i<${#tasks[@]};++i)); do + task=${tasks[i]} + for ((j=0;j<${#seeds[@]};++j)); do + seed=${seeds[j]} + for ((k=0;k<${#lrs[@]};++k)); do + lr=${lrs[k]} + bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint} + done + done +done \ No newline at end of file diff --git a/examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py b/examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py new file mode 100644 index 000000000..b359ecb6f --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py @@ -0,0 +1,118 @@ +import os +import statistics + +def gather_numbers(fname, match_keywords, index_keywords, index_offsets): + results = {} + for k in index_keywords: + results[k] = [] + file1 = open(fname, 'r') + while True: + line = file1.readline() + if not line: + break + splits = line.split(' ') + for i in range(len(match_keywords)): + if match_keywords[i] in line: + ref_idx = splits.index(index_keywords[i]) + results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]])) + file1.close() + return results + +def gather_GLUE_results(result_path, key, lr): + result = [] + mnli_matched_result = [] + mnli_mismatched_result = [] + for file in os.listdir(result_path): + if file.startswith(key) and lr in file: + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + if key == "STS-B": + results = gather_numbers(fname, ['metrics for'], ['spearmanr'], [2]) + overall_candidate = results['spearmanr'] + overall_candidate = [x * 100.0 for x in overall_candidate] + elif key == "CoLA": + results = gather_numbers(fname, ['metrics for'], ['mcc'], [2]) + overall_candidate = results['mcc'] + overall_candidate = [x * 100.0 for x in overall_candidate] + elif key == "MNLI": + results = gather_numbers(fname, + ['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'], + ['overall:', 'dev-matched:', 'dev-mismatched:'], + [9, 9, 9]) + overall_candidate = results['overall:'] + matched_candidate = results['dev-matched:'] + mismatched_candidate = results['dev-mismatched:'] + else: + results = gather_numbers(fname, ['overall:'], ['overall:'], [9]) + overall_candidate = results['overall:'] + if len(overall_candidate) > 0: + if len(overall_candidate) != 3: + print(f"{result_path} task {key} lr {lr} only has {len(overall_candidate)} epoch") + best_index = overall_candidate.index(max(overall_candidate)) + result.append(overall_candidate[best_index]) + if key == "MNLI": + mnli_matched_result.append(matched_candidate[best_index]) + mnli_mismatched_result.append(mismatched_candidate[best_index]) + if len(result) > 0: + if len(result) != 5: + print(f"{result_path} task {key} lr {lr} only has {len(result)} seed") + if key == "MNLI": + best_index = result.index(statistics.median_high(result)) + return round(mnli_matched_result[best_index],2), round(statistics.stdev(mnli_matched_result),2), round(mnli_mismatched_result[best_index],2), round(statistics.stdev(mnli_mismatched_result),2) + else: + return round(statistics.median_high(result),2), round(statistics.stdev(result),2) + else: + if key == "MNLI": + return None, None, None, None + else: + return None, None + +def gather_finetune_results(result_path, extra_col=[], lr="2e-5"): + output = "" + for field in extra_col: + output += f"{field} &" + task_output = "" + median_list, std_list = [], [] + m_median, m_std, mm_median, mm_std = gather_GLUE_results(result_path, "MNLI", lr) + if m_median is not None: + median_list += 
[m_median, mm_median] + std_list += [m_std, mm_std] + task_output += f"{m_median}±{m_std} & {mm_median}±{mm_std} &" + tasks = ["QQP", "QNLI", "SST-2", "CoLA", "STS-B", "MRPC", "RTE"] + for task in tasks: + t_median, t_std = gather_GLUE_results(result_path, task, lr) + if t_median is not None: + median_list += [t_median] + std_list += [t_std] + if task == "RTE": + task_output += f"{t_median}±{t_std} " + else: + task_output += f"{t_median}±{t_std} &" + overall_median = round(sum(median_list) / len(median_list), 2) + overall_std = round(sum(std_list) / len(std_list), 2) + output += f"{overall_median}±{overall_std} &" + output += task_output + output += " \\\\" + print(output) + +if __name__ == '__main__': + print("\\begin{table}") + print("\centering") + print("\\tiny") + text = "\\begin{tabular}{@{}l|" + for _ in range(11): + text += "c" + text += "@{}}" + print(text) + print("\\toprule") + print("Case & Train tokens & Average & MNLI-m & MNLI-mm & QQP & QNLI & SST-2 & CoLA & STS-B & MRPC & RTE \\\\") + print("\midrule") + + result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/' + gather_finetune_results(result_path) + + print("\\bottomrule") + print("\end{tabular}") + print("\end{table}") + print("") + print("") \ No newline at end of file diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh index f03c65ccf..c771a0e27 100644 --- a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh @@ -8,26 +8,35 @@ # train_iters_in_million=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} ############################################################################### +## Baseline 703B tokens (67%): +# lr=1.5e-4 +# train_iters_in_million=134e-2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} +############################################################################### +## Baseline 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} +############################################################################### ### Curriculum learning (CL) + Random layerwise token dropping (random-LTD). -### Due to resource constraints, we did not finish training any model with this -### setup. This example is just to demonstrate that CL+random-LTD can run for +### DeepSpeed Data Efficiency's composed solution. ### BERT pretraining. 
## CL+random-LTD 1049B tokens (100%): # lr=1e-4 # train_iters_in_million=2 # ltd_enabled="true" -# ltd_start=200 -# ltd_step_in_million=18e-1 -# dropout=0 +# ltd_start=128 +# ltd_step_in_million=2 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=2 # cl_1st_metric="voc" -# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" -# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 +# cl_1st_min=5 +# cl_1st_max=100 # cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 @@ -50,46 +59,159 @@ # ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ # ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ # ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ -# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL+random-LTD 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step_in_million=1 +# dropout=1e-1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=48e-2 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=48e-2 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} ############################################################################### ### Random layerwise token dropping (random-LTD). 
-## random-LTD 723B tokens (69%): -# lr=1.45e-4 -# train_iters_in_million=138e-2 +## random-LTD 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 # ltd_enabled="true" -# ltd_start=200 -# ltd_step_in_million=18e-1 -# dropout=0 +# ltd_start=128 +# ltd_step_in_million=2 +# dropout=1e-1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} +############################################################################### +## random-LTD 703B tokens (67%): +# lr=1.5e-4 +# train_iters_in_million=134e-2 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step_in_million=134e-2 +# dropout=1e-1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} +############################################################################### +## random-LTD 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step_in_million=1 +# dropout=1e-1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ # ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} ############################################################################### ### Curriculum learning (CL). -## CL vocab rarity 734B tokens (70%): -# lr=1.4e-4 -# train_iters_in_million=14e-1 +## CL vocab rarity + seqlen truncation 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 # dropout=1e-1 # cl_enabled="true" -# cl_num_metric=1 +# cl_num_metric=2 # cl_1st_metric="voc" -# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" -# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 -# cl_1st_total_step_in_million=7e-1 +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=48e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=48e-2 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ # ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ # ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ # ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ # ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ # ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ -# ${cl_1st_root} +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} 
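# Sanity check on the token budgets in the block labels above, assuming the
# base script (ds_pretrain_bert_336M_base_script.sh, not part of this patch)
# uses global batch size 1024 and sequence length 512:
#   2.0M iters  x 1024 x 512 ~= 1049B tokens (100%)
#   1.34M iters x 1024 x 512 ~=  703B tokens (67%)
#   1.0M iters  x 1024 x 512 ~=  524B tokens (50%)
# which is why each reduced token budget is paired with an inversely scaled
# peak learning rate (1e-4, 1.5e-4, 2e-4).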
+############################################################################### +## CL vocab rarity + seqlen truncation 703B tokens (67%): +# lr=1.5e-4 +# train_iters_in_million=134e-2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=1e-1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=64e-2 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=64e-2 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} ############################################################################### ## CL vocab rarity + seqlen truncation 1049B tokens (100%): # lr=1e-4 @@ -103,10 +225,10 @@ # cl_1st_metric="voc" # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" # cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 +# cl_1st_min=5 +# cl_1st_max=100 # cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 @@ -170,10 +292,10 @@ # cl_1st_metric="voc" # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" # cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 +# cl_1st_min=5 +# cl_1st_max=100 # cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 394261e3f..7b692b169 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -36,6 +36,7 @@ from megatron.model.distributed import DistributedDataParallel as LocalDDP from megatron.model.module import Float16Module from deepspeed.runtime.pipe import schedule +from deepspeed.accelerator import get_accelerator class 
EvalHarnessAdaptor(GPT2LM): def __init__(self, model, tokenizer): @@ -330,7 +331,7 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', - 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'moe_expert_parallel_size', 'moe_token_dropping', 'load', 'rampup_batch_size', 'iteration', 'inference'] + 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'moe_expert_parallel_size', 'moe_token_dropping', 'load', 'rampup_batch_size', 'iteration', 'inference', 'random_ltd'] skip_if_specified = ['merge_file', 'vocab_file'] diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index 6c15732c6..da7653929 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -64,7 +64,9 @@ def metrics_func(model, epoch, output_predictions=False): correct += correct_ans total += total_count if is_last_rank(): - percent = float(correct) * 100.0 / float(total) + percent = 0 + if total > 0: + percent = float(correct) * 100.0 / float(total) print(' >> |epoch: {}| overall: correct / total = {} / {} = ' '{:.4f} %'.format(epoch, correct, total, percent)) @@ -102,6 +104,7 @@ def calculate_correct_answers(name, model, dataloader, num_micro_batches = args.orig_global_batch_size // micro_batch_size_times_data_parallel def loss_func(output_predictions, labels, output_tensor): + args = get_args() logits = output_tensor loss_dict = {} @@ -113,11 +116,20 @@ def loss_func(output_predictions, labels, output_tensor): loss_dict['labels'] = labels.data.cpu().numpy().tolist() loss_dict['ids'] = batch['uid'].cpu().numpy().tolist() # Compute the correct answers. - predicted = torch.argmax(logits, dim=-1) - corrects = (predicted == labels) - # Add to the counters. - loss_dict['total'] = labels.size(0) - loss_dict['correct'] = corrects.sum().item() + if args.finetune and args.task == 'CoLA': + predicted = torch.argmax(logits, dim=-1) + loss_dict['labels'] = labels.data.cpu().numpy().tolist() + loss_dict['predicted'] = predicted.data.cpu().numpy().tolist() + elif args.finetune and args.task == 'STS-B': + predicted = torch.squeeze(logits) + loss_dict['labels'] = labels.data.cpu().numpy().tolist() + loss_dict['predicted'] = predicted.data.cpu().numpy().tolist() + else: + predicted = torch.argmax(logits, dim=-1) + corrects = (predicted == labels) + # Add to the counters. + loss_dict['total'] = labels.size(0) + loss_dict['correct'] = corrects.sum().item() return 0, loss_dict @@ -139,6 +151,8 @@ def correct_answers_forward_step(batch, model): # For all the batches in the dataset. total = 0 correct = 0 + labels = [] + predicted = [] if output_predictions: # This option is only possible when data parallel size is 1. assert mpu.get_data_parallel_world_size() == 1 @@ -162,8 +176,12 @@ def correct_answers_forward_step(batch, model): softmaxes.extend(loss_dict['softmaxes']) labels.extend(loss_dict['labels']) ids.extend(loss_dict['ids']) - total += loss_dict['total'] - correct += loss_dict['correct'] + if args.finetune and args.task in ['CoLA', 'STS-B']: + labels.extend(loss_dict['labels']) + predicted.extend(loss_dict['predicted']) + else: + total += loss_dict['total'] + correct += loss_dict['correct'] for m in model: @@ -173,24 +191,70 @@ def correct_answers_forward_step(batch, model): # Reduce. 
if mpu.is_pipeline_last_stage(): - unreduced = get_accelerator().LongTensor([correct, total]) - torch.distributed.all_reduce(unreduced, - group=mpu.get_data_parallel_group()) + if args.finetune and args.task in ['CoLA', 'STS-B']: + if args.task == 'CoLA': + labels = get_accelerator().LongTensor(labels) + predicted = get_accelerator().LongTensor(predicted) + labels_gather = [torch.zeros(len(labels), dtype=torch.long, + device=labels.device) for _ in range(mpu.get_data_parallel_world_size())] + predicted_gather = [torch.zeros(len(predicted), dtype=torch.long, + device=predicted.device) for _ in range(mpu.get_data_parallel_world_size())] + else: + labels = get_accelerator().FloatTensor(labels) + predicted = get_accelerator().FloatTensor(predicted) + labels_gather = [torch.zeros(len(labels), dtype=torch.float, + device=labels.device) for _ in range(mpu.get_data_parallel_world_size())] + predicted_gather = [torch.zeros(len(predicted), dtype=torch.float, + device=predicted.device) for _ in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather(labels_gather, labels, + group=mpu.get_data_parallel_group()) + torch.distributed.all_gather(predicted_gather, predicted, + group=mpu.get_data_parallel_group()) - # Print on screen. + labels_gather = sum([x.data.cpu().numpy().tolist() for x in labels_gather], []) + predicted_gather = sum([x.data.cpu().numpy().tolist() for x in predicted_gather], []) - correct_ans = unreduced[0].item() - total_count = unreduced[1].item() - percent = float(correct_ans) * 100.0 / float(total_count) - elapsed_time = time.time() - start_time - print_rank_last(' > |epoch: {}| metrics for {}: correct / total ' - '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format( - epoch, name, correct_ans, total_count, - percent, elapsed_time)) + # Print on screen. + if args.task == 'CoLA': + from sklearn.metrics import matthews_corrcoef + mcc = matthews_corrcoef(labels_gather, predicted_gather) + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: mcc ' + '= {} , elapsed time (sec): {:.3f}'.format( + epoch, name, mcc, elapsed_time)) + else: + from scipy.stats import pearsonr, spearmanr + pearson_corr = pearsonr(predicted_gather, labels_gather)[0] + spearman_corr = spearmanr(predicted_gather, labels_gather)[0] + corr = (pearson_corr + spearman_corr) / 2 + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: pearson ' + '= {} spearmanr = {} corr = {} elapsed time (sec): {:.3f}'.format( + epoch, name, pearson_corr, spearman_corr, + corr, elapsed_time)) - if output_predictions: - return correct_ans, total_count, (softmaxes, labels, ids) - return correct_ans, total_count + if output_predictions: + return 0, 0, () + return 0, 0 + else: + unreduced = get_accelerator().LongTensor([correct, total]) + torch.distributed.all_reduce(unreduced, + group=mpu.get_data_parallel_group()) + + # Print on screen. 
+ + correct_ans = unreduced[0].item() + total_count = unreduced[1].item() + percent = float(correct_ans) * 100.0 / float(total_count) + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: correct / total ' + '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format( + epoch, name, correct_ans, total_count, + percent, elapsed_time)) + + if output_predictions: + return correct_ans, total_count, (softmaxes, labels, ids) + return correct_ans, total_count if output_predictions: return 0, 0, () return 0, 0 diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index fd4e79a98..c9f2daf6d 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -79,6 +79,48 @@ def _cross_entropy_forward_step(batch, model): return output_tensor, partial(cross_entropy_loss_func, labels) +def process_batch_mse(batch): + """Process batch and produce inputs for the model.""" + args = get_args() + + tokens = batch['text'].long().to(get_accelerator().device_name()).contiguous() + types = batch['types'].long().to(get_accelerator().device_name()).contiguous() + labels = batch['label'].float().to(get_accelerator().device_name()).contiguous() + attention_mask = batch['padding_mask'].float().to(get_accelerator().device_name()).contiguous() + if args.fp16: + attention_mask = attention_mask.half() + + return tokens, types, labels, attention_mask + +def mse_loss_func(labels, output_tensor): + logits = output_tensor + + # Cross-entropy loss. + loss_func = torch.nn.MSELoss() + loss = loss_func(logits.contiguous().float().view(-1), labels.view(-1)) + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + +def mse_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + tokens, types, labels, attention_mask = process_batch_mse(batch_) + timers('batch-generator').stop() + + # Forward model. + output_tensor = model(tokens, attention_mask, tokentype_ids=types) + + return output_tensor, partial(mse_loss_func, labels) def build_data_loader(dataset, micro_batch_size, num_workers, drop_last): """Data loader. Note that batch-size is the local (per GPU) batch-size.""" diff --git a/tasks/glue/cola.py b/tasks/glue/cola.py new file mode 100644 index 000000000..123f79533 --- /dev/null +++ b/tasks/glue/cola.py @@ -0,0 +1,90 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
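Before the new GLUE dataset readers, a note on the STS-B path added above: it reuses the Classification head with a single output as a regression head, trains it with MSE against float similarity scores, and reports Pearson/Spearman correlation instead of accuracy. A toy sketch of the added loss computation (synthetic tensors; STS-B gold scores lie in [0, 5]):

import torch

logits = torch.randn(8, 1)        # Classification head built with num_classes=1
labels = 5.0 * torch.rand(8)      # float similarity scores in [0, 5]
loss = torch.nn.MSELoss()(logits.contiguous().float().view(-1), labels.view(-1))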
+ +"""CoLA dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class CoLADataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('CoLA', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 2: + is_test = True + print_rank_0(' reading {} and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + self.test_label)) + continue + + if is_test: + assert len(row) == 2, 'expected length 2: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = None + label = self.test_label + assert len(text_a) > 0 + else: + if len(row) == 4: + uid = total + text_a = clean_text(row[3].strip()) + text_b = None + label = int(row[1].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/finetune.py b/tasks/glue/finetune.py index ad1938b0c..9f57734b0 100644 --- a/tasks/glue/finetune.py +++ b/tasks/glue/finetune.py @@ -21,7 +21,7 @@ from megatron import mpu from megatron.model.classification import Classification from tasks.eval_utils import accuracy_func_provider -from tasks.finetune_utils import finetune +from tasks.finetune_utils import finetune, mse_forward_step def glue_classification(num_classes, Dataset, @@ -60,9 +60,15 @@ def single_dataset_provider(datapath): return Dataset(name, [datapath], tokenizer, args.seq_length) return accuracy_func_provider(single_dataset_provider) + args = get_args() """Finetune/evaluate.""" - finetune(train_valid_datasets_provider, model_provider, - end_of_epoch_callback_provider=metrics_func_provider) + if args.task == 'STS-B': + finetune(train_valid_datasets_provider, model_provider, + forward_step=mse_forward_step, + end_of_epoch_callback_provider=metrics_func_provider) + else: + finetune(train_valid_datasets_provider, model_provider, + end_of_epoch_callback_provider=metrics_func_provider) def main(): @@ -85,7 +91,54 @@ def name_from_datapath(datapath): def name_from_datapath(datapath): return datapath.split('QQP')[-1].strip( '.tsv').strip('/').replace('_', '-') + elif args.task == 'QNLI': + + num_classes = 2 + from tasks.glue.qnli import QNLIDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('QNLI')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'SST-2': + + num_classes = 2 + from tasks.glue.sst2 import SST2Dataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('SST-2')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'CoLA': + + num_classes = 2 + from tasks.glue.cola 
import CoLADataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('CoLA')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'STS-B': + + num_classes = 1 + from tasks.glue.stsb import STSBDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('STS-B')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'MRPC': + + num_classes = 2 + from tasks.glue.mrpc import MRPCDataset as Dataset + def name_from_datapath(datapath): + return datapath.split('MRPC')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'RTE': + + num_classes = 2 + from tasks.glue.rte import RTEDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('RTE')[-1].strip( + '.tsv').strip('/').replace('_', '-') else: raise NotImplementedError('GLUE task {} is not implemented.'.format( args.task)) diff --git a/tasks/glue/mrpc.py b/tasks/glue/mrpc.py new file mode 100644 index 000000000..8dfac8402 --- /dev/null +++ b/tasks/glue/mrpc.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MRPC dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class MRPCDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('MRPC', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if row[0].strip() == 'index': + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[3].strip(), + row[4].strip(), self.test_label)) + else: + assert len(row) == 5 + print_rank_0(' reading {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[3].strip(), + row[4].strip())) + continue + + if is_test: + assert len(row) == 5, 'expected length 5: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[3].strip()) + text_b = clean_text(row[4].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 5: + uid = total + text_a = clean_text(row[3].strip()) + text_b = clean_text(row[4].strip()) + label = int(row[0].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = 
{'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/qnli.py b/tasks/glue/qnli.py new file mode 100644 index 000000000..af0841d4a --- /dev/null +++ b/tasks/glue/qnli.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""QNLI dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = {'entailment': 0, 'not_entailment': 1} + + +class QNLIDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label='entailment'): + self.test_label = test_label + super().__init__('QNLI', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 3: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), self.test_label)) + else: + assert len(row) == 4 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), row[3].strip())) + continue + + if is_test: + assert len(row) == 3, 'expected length 3: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 4: + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = row[-1].strip() + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': LABELS[label]} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/rte.py b/tasks/glue/rte.py new file mode 100644 index 000000000..096a26ecc --- /dev/null +++ b/tasks/glue/rte.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RTE dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = {'entailment': 0, 'not_entailment': 1} + + +class RTEDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label='entailment'): + self.test_label = test_label + super().__init__('RTE', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 3: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), self.test_label)) + else: + assert len(row) == 4 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), row[3].strip())) + continue + + if is_test: + assert len(row) == 3, 'expected length 3: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 4: + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = row[-1].strip() + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': LABELS[label]} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/sst2.py b/tasks/glue/sst2.py new file mode 100644 index 000000000..966efc247 --- /dev/null +++ b/tasks/glue/sst2.py @@ -0,0 +1,95 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""SST-2 dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class SST2Dataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('SST-2', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if row[0].strip() == 'index': + is_test = True + print_rank_0(' reading {} and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + self.test_label)) + else: + assert len(row) == 2 + print_rank_0(' reading {} and {} columns' + ' ...'.format( + row[0].strip(), row[1].strip())) + continue + + if is_test: + assert len(row) == 2, 'expected length 2: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = None + label = self.test_label + assert len(text_a) > 0 + else: + if len(row) == 2: + uid = total + text_a = clean_text(row[0].strip()) + text_b = None + label = int(row[-1].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/stsb.py b/tasks/glue/stsb.py new file mode 100644 index 000000000..692724620 --- /dev/null +++ b/tasks/glue/stsb.py @@ -0,0 +1,100 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""STS-B dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [None] + + +class STSBDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0.0): + self.test_label = test_label + super().__init__('STS-B', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 9: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[7].strip(), + row[8].strip(), self.test_label)) + else: + assert len(row) == 10 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[7].strip(), + row[8].strip(), row[-1].strip())) + continue + + if is_test: + assert len(row) == 9, 'expected length 9: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[7].strip()) + text_b = clean_text(row[8].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 10: + uid = int(row[0].strip()) + text_a = clean_text(row[7].strip()) + text_b = clean_text(row[8].strip()) + label = float(row[-1].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/main.py b/tasks/main.py index f5bd5ad69..978359a82 100644 --- a/tasks/main.py +++ b/tasks/main.py @@ -77,7 +77,7 @@ def get_tasks_args(parser): if args.task == 'RACE': from race.finetune import main - elif args.task in ['MNLI', 'QQP']: + elif args.task in ['MNLI', 'QQP', 'QNLI', 'SST-2', 'CoLA', 'STS-B', 'MRPC', 'RTE']: from glue.finetune import main elif args.task in ['LAMBADA', 'WIKITEXT103']: from zeroshot_gpt.evaluate import main
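The QNLI, RTE, SST-2, and STS-B readers introduced above all follow the same template: the header row is inspected once to decide whether the split is a test file (no label column), and every remaining row is reduced to a {'uid', 'text_a', 'text_b', 'label'} sample, with malformed or empty rows skipped. Below is a minimal standalone sketch of the two-sentence (QNLI/RTE-style) variant of that convention; it uses a hypothetical in-memory TSV and Python's csv module instead of the real GLUE files and the clean_text/GLUEAbstractDataset machinery, so it illustrates the parsing pattern rather than reproducing the code in this patch.

import csv
import io

LABELS = {'entailment': 0, 'not_entailment': 1}

# Hypothetical in-memory QNLI-style split; real runs read the GLUE TSV files.
# Train/dev rows end with a label column, test rows carry only index,
# question, and sentence.
TRAIN_TSV = ("index\tquestion\tsentence\tlabel\n"
             "0\tWhat is QNLI?\tQNLI is a GLUE task.\tentailment\n"
             "1\tIs the sky green?\tGrass is green.\tnot_entailment\n")


def parse_qnli_like(tsv_text, test_label='entailment'):
    reader = csv.reader(io.StringIO(tsv_text), delimiter='\t')
    header = next(reader)
    is_test = len(header) == 3      # no label column -> test split
    samples = []
    for row in reader:
        if len(row) != (3 if is_test else 4):
            continue                # mirrors the "index error ... skipping" warning
        uid = int(row[0].strip())
        text_a = row[1].strip()
        text_b = row[2].strip()
        if not text_a or not text_b:
            continue                # mirrors the zero-length-text skips
        label = test_label if is_test else row[3].strip()
        samples.append({'uid': uid, 'text_a': text_a,
                        'text_b': text_b, 'label': LABELS[label]})
    return samples


print(parse_qnli_like(TRAIN_TSV))

SST-2 follows the same flow with a single sentence (text_b is None) and integer labels, and STS-B keeps a float similarity score instead of a class id. Combined with the tasks/main.py dispatch change at the end of the patch, these datasets become reachable from the shared GLUE finetuning entry point via the task argument (e.g. --task QNLI or --task STS-B), alongside the existing MNLI and QQP tasks.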