From 4cf3f4b19b7a433682526f3781f87cd96e3f5cca Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Thu, 3 Nov 2022 10:34:22 -0700 Subject: [PATCH 01/11] Modifying loss checking to support bf16. (#92) --- megatron/training.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 8780582ef..f02df5ba2 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -963,7 +963,10 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Logging. if args.deepspeed: - loss_scale = model[0].optimizer.cur_scale + if hasattr(model[0].optimizer, 'cur_scale'): + loss_scale = model[0].optimizer.cur_scale + else: + loss_scale = None else: loss_scale = optimizer.get_loss_scale().item() params_norm = None @@ -1233,4 +1236,4 @@ def build_train_valid_test_data_iterators( else: test_data_iterator = None - return train_data_iterator, valid_data_iterator, test_data_iterator \ No newline at end of file + return train_data_iterator, valid_data_iterator, test_data_iterator From 238ab6df7e7561884fa2f3993d69099758105946 Mon Sep 17 00:00:00 2001 From: savitamittal1 <39776179+savitamittal1@users.noreply.github.com> Date: Mon, 7 Nov 2022 10:14:39 -0800 Subject: [PATCH 02/11] Updated to Curated acpt env and removed deepspeed install from github (#88) * updated curated env * updated to use aml curated env --- examples/azureml/Dockerfile.dockerfile | 13 ++----------- examples/azureml/aml_submit.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/examples/azureml/Dockerfile.dockerfile b/examples/azureml/Dockerfile.dockerfile index 2d43612f5..6e70a2d56 100644 --- a/examples/azureml/Dockerfile.dockerfile +++ b/examples/azureml/Dockerfile.dockerfile @@ -1,14 +1,5 @@ -FROM mcr.microsoft.com/azureml/aifx/stable-ubuntu2004-cu115-py38-torch1110 - +FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.5-gpu USER root:root RUN pip install pybind11 - -RUN pip install git+https://github.com/microsoft/DeepSpeed.git - -# add a100-topo.xml -RUN mkdir -p /opt/microsoft/ -RUN wget -O /opt/microsoft/a100-topo.xml https://hpcbenchmarks.blob.core.windows.net/bookcorpus/data/a100-topo.xml - -# to use on A100, enable env var below in your job -ENV NCCL_TOPO_FILE="/opt/microsoft/a100-topo.xml" +RUN pip install regex \ No newline at end of file diff --git a/examples/azureml/aml_submit.py b/examples/azureml/aml_submit.py index 58ea6f683..ebfa0a9bf 100644 --- a/examples/azureml/aml_submit.py +++ b/examples/azureml/aml_submit.py @@ -8,6 +8,7 @@ from azureml.core.compute import ComputeTarget, AmlCompute from azureml.core.compute_target import ComputeTargetException from azureml.core.runconfig import PyTorchConfiguration +from azureml.core.environment import DockerBuildContext # Check core SDK version number print("SDK version:", azureml.core.VERSION) @@ -64,15 +65,9 @@ #------------------------------------------------------------------------------- # Setup training environment #------------------------------------------------------------------------------- -megatron_ds_env = Environment.from_dockerfile(name='megatron-ds-ptca', dockerfile='Dockerfile.dockerfile') -megatron_ds_env.register(ws).build(ws).wait_for_completion() # Comment this out if environment already exists -megatron_ds_env.environment_variables['NCCL_DEBUG'] = 'WARN' -megatron_ds_env.environment_variables['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' -megatron_ds_env.environment_variables['NCCL_SOCKET_IFNAME'] = 'eth0' 
-megatron_ds_env.environment_variables['NCCL_IB_PCI_RELAXED_ORDERING']='1' -megatron_ds_env.environment_variables['UCX_TLS']='tcp' -megatron_ds_env.environment_variables['UCX_NET_DEVICES']='eth0' +megatron_ds_env = Environment.from_docker_build_context(name='megatron-ds-curated-acpt', docker_build_context=DockerBuildContext.from_local_directory(workspace = ws, path = '.', dockerfile_path='Dockerfile.dockerfile')) +megatron_ds_env.register(ws).build(ws).wait_for_completion() # Comment this out if environment already exists #------------------------------------------------------------------------------- # Training Settings and Arguments @@ -187,6 +182,13 @@ environment=megatron_ds_env, distributed_job_config=distr_config) +megatron_ds_src.run_config.environment_variables['NCCL_DEBUG'] = 'WARN' +megatron_ds_src.run_config.environment_variables['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' +megatron_ds_src.run_config.environment_variables['NCCL_SOCKET_IFNAME'] = 'eth0' +megatron_ds_src.run_config.environment_variables['NCCL_IB_PCI_RELAXED_ORDERING']='1' +megatron_ds_src.run_config.environment_variables['UCX_TLS']='tcp' +megatron_ds_src.run_config.environment_variables['UCX_NET_DEVICES']='eth0' + #------------------------------------------------------------------------------- # Submit experiment #------------------------------------------------------------------------------- From c685fb5e4973864ab0d0ad30e55edc014e151ca5 Mon Sep 17 00:00:00 2001 From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com> Date: Fri, 18 Nov 2022 07:20:46 -0800 Subject: [PATCH 03/11] Fix the bug of FusedLayerNorm on ROCm (#96) --- .../fused_kernels/layer_norm_cuda_kernel.cu | 57 ++++++++++++------- megatron/model/fused_layer_norm.py | 7 ++- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index efd2ff707..8a07806b1 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -76,7 +76,8 @@ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) + U* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -106,12 +107,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1<(muB,sigma2B,countB,mu,sigma2,count); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -160,7 +160,8 @@ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf) + float* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -201,12 +202,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1< 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -308,7 +308,8 @@ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* 
__restrict__ beta + const V* __restrict__ beta, + const int GPU_WARP_SIZE ) { // Assumptions: @@ -323,7 +324,7 @@ void cuApplyLayerNorm( SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf); + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); const T* lvals = vals + i1*n2; V* ovals = output_vals + i1*n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -686,7 +687,11 @@ void HostApplyLayerNorm( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); - const dim3 threads(32,4,1); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); @@ -701,7 +706,9 @@ void HostApplyLayerNorm( input, n1,n2, U(epsilon), - gamma,beta); + gamma, + beta, + warp_size); } @@ -754,11 +761,16 @@ void HostLayerNormGradient( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); if (gamma != NULL && beta != NULL) { // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else const int part_size = 16; - const dim3 threads2(32,4,1); +#endif + const dim3 threads2(warp_size,4,1); const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); @@ -777,7 +789,7 @@ void HostLayerNormGradient( part_grad_gamma.DATA_PTR(), part_grad_beta.DATA_PTR()); - const dim3 threads3(32,8,1); + const dim3 threads3(warp_size,8,1); const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -793,7 +805,10 @@ void HostLayerNormGradient( const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(32,4,1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif int nshared = threads1.y > 1 ? threads1.y*threads1.x*sizeof(U) : diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 78645c236..f34a05ffe 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -22,6 +22,7 @@ from torch.nn.parameter import Parameter from torch.nn import init import importlib +from torch.nn import functional as F global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -84,7 +85,11 @@ def reset_parameters(self): def forward(self, input): - + # CPU path is here for unittest sake. + if not input.is_cuda: + print("WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedLayerNorm unit tests.") + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) return FusedLayerNormAffineFunction.apply( input, self.weight, self.bias, self.normalized_shape,self.eps) From f7ebcad3f94c090ce0a20d69628229e0f39a1f23 Mon Sep 17 00:00:00 2001 From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Date: Wed, 7 Dec 2022 10:37:43 -0800 Subject: [PATCH 04/11] Fix generate_text.sh Megatron text-generation example working w/ DS inference (#99) This PR gets text-generation working in the `examples/generate_text.sh` example working w/ DS inference enabled. 
For the main fix, the `sample_sequence_batch` function has been updated to perform the softmax when calculating `log_probs`, instead of setting it to 1's using `torch.ones_like(...)`, although a few minor fixes were applied as well. Extra whitespace is also removed. --- examples/generate_text.sh | 11 ++++++----- megatron/checkpointing.py | 9 +++++---- megatron/text_generation_utils.py | 12 +++++------- tools/generate_samples_gpt.py | 16 ++++++++-------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/generate_text.sh b/examples/generate_text.sh index 63501fc67..7e330ab8d 100755 --- a/examples/generate_text.sh +++ b/examples/generate_text.sh @@ -5,7 +5,7 @@ VOCAB_FILE=gpt2-vocab.json MERGE_FILE=gpt2-merges.txt b=8 mp=1 -experts=2 +experts=1 nodes=1 gpus=1 @@ -19,7 +19,7 @@ ds_inference="--ds-inference" launch_cmd="deepspeed --num_nodes $nodes --num_gpus $gpus" L=24 -H=2048 +H=1024 A=16 #experts1=${experts[$k]} program_cmd="tools/generate_samples_gpt.py \ @@ -33,15 +33,16 @@ program_cmd="tools/generate_samples_gpt.py \ --num-experts ${experts} \ --mlp-type standard \ --micro-batch-size $b \ - --seq-length 10 \ - --out-seq-length 10 \ + --seq-length 1024 \ + --out-seq-length 1024 \ --temperature 1.0 \ --vocab-file $VOCAB_FILE \ --merge-file $MERGE_FILE \ --genfile unconditional_samples.json \ --top_p 0.9 \ --log-interval 1 \ - --num-samples $((100*$b)) + --num-samples 0 \ + --load $CHECKPOINT_PATH \ $use_tutel $ds_inference" echo $launch_cmd $program_cmd diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 8f6b9ec28..52c2a84da 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -137,7 +137,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): for i in range(len(model)): mpu.set_virtual_pipeline_model_parallel_rank(i) state_dict['model%d' % i] = model[i].state_dict_for_save_checkpoint() - + # Optimizer stuff. if not args.no_save_optim: if optimizer is not None: @@ -169,7 +169,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): # Saving is a collective communication checkpoint_name = get_checkpoint_name(args.save, iteration) - + # Trim off the filename and mp_rank_* directory. 
for _ in range(3): checkpoint_name = os.path.dirname(checkpoint_name) @@ -201,7 +201,8 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model): # specific to self attention so should work for cross attention as well while hasattr(model, 'module'): model = model.module - attention_module = model.language_model.encoder.layers[0].self_attention + #attention_module = model.language_model.encoder.layers[0].self_attention + attention_module = model.language_model.encoder.layers[0].attention hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition if num_splits_first: @@ -442,7 +443,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True def load_biencoder_checkpoint(model, only_query_model=False, only_context_model=False, custom_load_path=None): """ - selectively load retrieval models for indexing/retrieving + selectively load retrieval models for indexing/retrieving from saved checkpoints """ diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7e81e5fae..adf04bcb4 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -191,7 +191,7 @@ def generate_samples_input_from_file(model): context_count += 1 # We added this function to support the tasks evaluation such as squad -# and drop in the https://github.com/EleutherAI/lm-evaluation-harness +# and drop in the https://github.com/EleutherAI/lm-evaluation-harness # codebase. The lm-evaluation-harness code can now call this function # similar to their current generate function call used for gpt style models. def generate_samples_eval(model, context, max_gen_length, eos_token_id): @@ -218,7 +218,7 @@ def generate_samples_eval(model, context, max_gen_length, eos_token_id): decode_tokens = decode_tokens[0].cpu().numpy().tolist() trim_decode_tokens = tokenizer.detokenize( decode_tokens)[raw_text_len:] - + return trim_decode_tokens @@ -416,9 +416,9 @@ def get_token_stream(model, context_tokens, model_latencies=[], single_token_lat batch_token_iterator = sample_sequence_batch(model, context_tokens_tensor, context_length_tensor, attention_mask, position_ids, model_latencies=model_latencies) - + count = 0 - + t0=time.time() for tokens, lengths in batch_token_iterator: if count > 1: @@ -559,9 +559,7 @@ def sample_sequence_batch(model, context_tokens, context_lengths, logits /= args.temperature logits = top_k_logits(logits, top_k=args.top_k, top_p=args.top_p) - log_probs = torch.ones_like(logits) - #TODO: Fix this - #log_probs = F.softmax(logits, dim=-1) + log_probs = F.softmax(logits, dim=-1) prev = torch.multinomial(log_probs, num_samples=1).view(-1) started = context_lengths <= context_length diff --git a/tools/generate_samples_gpt.py b/tools/generate_samples_gpt.py index 5df2c698e..bbd1164c4 100644 --- a/tools/generate_samples_gpt.py +++ b/tools/generate_samples_gpt.py @@ -41,7 +41,7 @@ def model_provider(pre_process=True, post_process=True): print_rank_0('building GPT model ...') model = GPTModel(num_tokentypes=0, parallel_output=False, - pre_process=pre_process, post_process=post_process, + pre_process=pre_process, post_process=post_process, return_moe_loss=False) # we need to set "return_moe_loss" for the inference_mode return model @@ -102,7 +102,7 @@ def print_latency(latency_set, title=""): print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000)) print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000)) print("\t999 Latency: {0:8.2f} 
ms".format(p999 * 1000)) - + def main(): """Main program.""" latencies = [] @@ -115,7 +115,7 @@ def main(): 'no_load_optim': True}) args = get_args() - + if args.num_layers_per_virtual_pipeline_stage is not None: print("Interleaved pipeline schedule is not yet supported for text generation.") exit() @@ -142,8 +142,8 @@ def main(): generate_samples_interactive(model) else: generate_and_write_samples_unconditional(model, latencies, single_token_latency, model_latencies) - - + + #if torch.cuda.current_device() == 0: if torch.distributed.get_rank() == 0: print_latency(latencies) @@ -154,13 +154,13 @@ def main(): def ds_inference(model, args): import megatron.model as mm engine = deepspeed.init_inference(model=model, - mp_size=args.tensor_model_parallel_size, - mpu=mpu, + mp_size=args.tensor_model_parallel_size, + tensor_parallel={"mpu": mpu}, dtype=torch.half, replace_with_kernel_inject=True, moe_experts=args.num_experts, moe_type=args.mlp_type) - + return engine.module if __name__ == "__main__": From 5e8d578483d2f5ea963eec1d3ad1e797117b6706 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Mon, 12 Dec 2022 15:15:04 -0800 Subject: [PATCH 05/11] DeepSpeed Data Efficiency Library pretraining examples (#100) * staging_data_efficiency_v1 (#12) * refactor and clean * script refactor * fix * fix * fix * fix * refactor * script * CL diff type * script cleanup * fix for MP * refactor * refactor * fix * apply feedback --- examples/README.md | 5 + examples/bert_with_pile/prepare_pile_data.py | 4 +- examples/data_efficiency/README.md | 23 + examples/data_efficiency/analyze_data.py | 239 ++++++++ .../bert/ds_analyze_bert_data_map.sh | 67 +++ .../bert/ds_analyze_bert_data_reduce.sh | 66 +++ .../finetune/ds_config_bert_TEMPLATE.json | 24 + .../bert/finetune/ds_finetune_bert_mnli.sh | 150 +++++ .../bert/finetune/ds_finetune_bert_qqp.sh | 158 ++++++ .../bert/finetune/ds_finetune_bert_race.sh | 172 ++++++ .../finetune/ds_finetune_gather_result.py | 111 ++++ .../bert/pile_data_download_preprocess.py | 129 +++++ .../ds_config_bert_1clmetric_TEMPLATE.json | 74 +++ .../ds_config_bert_2clmetrics_TEMPLATE.json | 88 +++ .../ds_pretrain_bert_336M_base_script.sh | 472 ++++++++++++++++ .../pretrain/ds_pretrain_bert_336M_run.sh | 241 ++++++++ .../gpt/ds_analyze_gpt_data_map.sh | 70 +++ .../gpt/ds_analyze_gpt_data_reduce.sh | 69 +++ .../gpt/eval/ds_config_eval_dummy.json | 28 + .../gpt/eval/ds_evalharness_1gpu.sh | 77 +++ .../gpt/eval/ds_evalharness_gather_result.py | 358 ++++++++++++ .../gpt/eval/ds_evalharness_parallel_run.sh | 66 +++ .../ds_evalharness_parallel_run_10shot.sh | 61 +++ .../ds_config_gpt_1clmetric_TEMPLATE.json | 74 +++ .../ds_config_gpt_2clmetrics_TEMPLATE.json | 88 +++ .../ds_pretrain_gpt_1.3B_dense_base_script.sh | 515 ++++++++++++++++++ .../ds_pretrain_gpt_1.3B_dense_run.sh | 366 +++++++++++++ megatron/arguments.py | 21 +- megatron/data/bert_dataset.py | 6 +- megatron/data/dataset_utils.py | 49 +- megatron/data/gpt_dataset.py | 23 +- megatron/initialize.py | 11 +- megatron/model/gpt_model.py | 2 +- megatron/model/language_model.py | 11 +- megatron/model/transformer.py | 6 +- megatron/model/utils.py | 2 +- megatron/training.py | 151 ++++- megatron/utils.py | 5 +- pretrain_bert.py | 32 +- pretrain_gpt.py | 42 +- tasks/eval_harness/evaluate.py | 5 +- 41 files changed, 4087 insertions(+), 74 deletions(-) create mode 100644 examples/data_efficiency/README.md create mode 100644 examples/data_efficiency/analyze_data.py create mode 100644 examples/data_efficiency/bert/ds_analyze_bert_data_map.sh create mode 
100644 examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh create mode 100644 examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py create mode 100644 examples/data_efficiency/bert/pile_data_download_preprocess.py create mode 100644 examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh create mode 100644 examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh create mode 100644 examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh create mode 100644 examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh create mode 100644 examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh create mode 100644 examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh create mode 100644 examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json create mode 100644 examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json create mode 100644 examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh create mode 100644 examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh diff --git a/examples/README.md b/examples/README.md index 80b1d4488..a130a5312 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,9 +12,14 @@ If you have a custom infrastructure (e.g. HPC clusters) or Azure VM and VMSS bas Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-expert based models and dense models. These recipes are for GPT-style NLG models. +### Data Efficiency + +The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for DeepSpeed Data Efficiency Library. Please refer to the detailed tutorials in data_efficiency/README.MD. + ### Curriculum Learning Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models. +Note that the DeepSpeed Data Efficiency Library above includes a more general curriculum learning support. This legacy curriculum learning feature is still compatible, but we recommend using the DeepSpeed Data Efficiency Library above. 
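For reference, the legacy feature is driven by a ```curriculum_learning``` block in the DeepSpeed config JSON. A minimal sketch is shown below; the values are illustrative only, and the recommended settings are in the tutorials linked inside the ```curriculum_learning``` folder.

```json
{
  "curriculum_learning": {
    "enabled": true,
    "curriculum_type": "seqlen",
    "min_difficulty": 8,
    "max_difficulty": 1024,
    "schedule_type": "fixed_linear",
    "schedule_config": {
      "total_curriculum_step": 15000,
      "difficulty_step": 8
    }
  }
}
```

The Data Efficiency Library supersedes this block with a more general ```data_efficiency``` config section; see the ds_config_*_TEMPLATE.json files added under ```data_efficiency``` for complete examples.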
### Model Compression diff --git a/examples/bert_with_pile/prepare_pile_data.py b/examples/bert_with_pile/prepare_pile_data.py index d5eed96d5..953d5966d 100644 --- a/examples/bert_with_pile/prepare_pile_data.py +++ b/examples/bert_with_pile/prepare_pile_data.py @@ -2,9 +2,9 @@ import sys import time import os - import sys -sys.path.insert(1, '../../') +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) from megatron.data import indexed_dataset def pile_download(download_url, file_path, i): diff --git a/examples/data_efficiency/README.md b/examples/data_efficiency/README.md new file mode 100644 index 000000000..e10db17d5 --- /dev/null +++ b/examples/data_efficiency/README.md @@ -0,0 +1,23 @@ +This directory includes GPT-3/BERT pretraining example scripts for DeepSpeed Data Efficiency Library technologies (curriculum learning, random-LTD, and the two composed together). + +You need to install updated DeepSpeed version (>=0.8.0), which contains the DeepSpeed Data Efficiency Library. + +Additional tutorial can be found at [DeepSpeed website](https://www.deepspeed.ai/tutorials/data-efficiency/). + +Additional technical details can be found in our [random-LTD paper](https://arxiv.org/abs/2211.11586) and [data efficiency paper](https://arxiv.org/abs/2212.03597). + +## GPT-3 pretraining and evaluation +Inside ``gpt`` folder, first the ``ds_analyze_gpt_data_map.sh`` and ``ds_analyze_gpt_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing. + +``gpt/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_gpt_1.3B_dense_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality. + +``gpt/eval`` includes the zero-/few-shot evaluation example scripts. ``ds_evalharness_parallel_run.sh`` is for zero-shot, and ``ds_evalharness_parallel_run_10shot.sh`` is for 10-shot. + +## BERT pretraining and finetuning +Inside ``bert`` folder, first the ``pile_data_download_preprocess.py`` can be used to download and preprocess the public Pile dataset. + +The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are used for curriculum learning's offline data analysis and indexing. + +``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality. + +``bert/finetune`` includes the finetuning example scripts. \ No newline at end of file diff --git a/examples/data_efficiency/analyze_data.py b/examples/data_efficiency/analyze_data.py new file mode 100644 index 000000000..36002d46d --- /dev/null +++ b/examples/data_efficiency/analyze_data.py @@ -0,0 +1,239 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +Copyright 2022 The Microsoft DeepSpeed Team +''' + +import os +import time +import sys +import math +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir))) +from datetime import datetime +import numpy as np +import torch + +from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer \ + import DataAnalyzer +from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset \ + import MMapIndexedDataset + +from megatron import get_args +from megatron import print_rank_0 +from megatron.initialize import initialize_megatron + +def get_tasks_args(parser): + """Provide extra arguments required for data analyzing.""" + group = parser.add_argument_group(title='data_analyzing') + + group.add_argument('--analyzing-task', type=str, required=True, + default=None, + choices=['map', + 'reduce'], + help='What type of analyzing task to perform.') + group.add_argument('--analyzing-data-type', type=str, required=True, + default=None, + choices=['BERT', + 'GPT'], + help='What type of data.') + group.add_argument('--analyzing-metric', type=str, nargs='+', default=[], + help='What kinds of metrics to analyze.') + group.add_argument('--analyzing-num-workers', type=int, default=1, + help='Number of workers. Each worker could be a single CPU node.') + group.add_argument('--analyzing-worker-id', type=int, default=0, + help='Worker id of current node.') + group.add_argument('--analyzing-num-threads', type=int, default=1, + help='Number of threads for each worker.') + group.add_argument('--analyzing-num-threads-reduce', type=int, default=1, + help='Number of threads for each worker.') + group.add_argument('--analyzing-specific-threads', type=int, nargs='+', default=[], + help='Which specific threads to run. 
Helpful when there are specific thread failed in previous run.') + return parser + +def train_valid_test_datasets_provider_gpt(): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for GPT ...') + from megatron.data.gpt_dataset import build_train_valid_test_datasets + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + +def train_valid_test_datasets_provider_bert(): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for BERT ...') + from megatron.data.dataset_utils import build_train_valid_test_datasets + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=[1,1,1], # Just dummy numbers since we assume args.train_data_exact_num_epochs will override them + max_seq_length=args.seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + binary_head=args.bert_binary_head) + print_rank_0("> finished creating BERT datasets ...") + + return train_ds, valid_ds, test_ds + +def metric_seqlen(data): + metric = torch.count_nonzero(data['padding_mask'], dim=1) + return metric + +def metric_total_vocab_freq(data): + args = get_args() + if args.analyzing_data_type == 'BERT': + frequency = torch.bincount(data['text'].view(-1), + minlength=args.padded_vocab_size+1, + weights=data['padding_mask'].view(-1)) + elif args.analyzing_data_type == 'GPT': + frequency = torch.bincount(data['text'].view(-1), + minlength=args.padded_vocab_size+1) + return frequency + +def metric_vocab_rarity(data): + args = get_args() + if args.analyzing_data_type == 'BERT': + rarity = torch.sum(data['padding_mask'] * \ + args.total_vocab_freq[data['text']], dim=1).to(torch.long) + elif args.analyzing_data_type == 'GPT': + rarity = [] + # Do one by one to avoid too high memory consumption + for row in range(data['text'].size()[0]): + rarity.append(int(torch.sum(args.total_vocab_freq[data['text'][row]]).item())) + rarity = torch.tensor(rarity, dtype=torch.long) + print(f"rarity min {min(rarity)}, max {max(rarity)}, len {len(rarity)}, avg {sum(rarity)/len(rarity)}") + return rarity + +def metric_seqlen_vocab_rarity(data): + args = get_args() + metric = torch.count_nonzero(data['padding_mask'], dim=1).to(torch.long) * args.seqlen_coeff + metric += torch.sum(data['padding_mask'] * \ + args.total_vocab_freq[data['text']], dim=1).to(torch.long) + print(f"metric min {min(metric)}, max {max(metric)}, len {len(metric)}, avg {sum(metric)/len(metric)}") + return metric + +def get_metric_function(metric_name): + if metric_name == 'seqlen': + return metric_seqlen + if metric_name == 'total_vocab_freq': + return metric_total_vocab_freq + if metric_name == 'vocab_rarity': + return metric_vocab_rarity + if metric_name == 'seqlen_vocab_rarity': + return metric_seqlen_vocab_rarity + +def get_metric_type(metric_name): + if metric_name == 'seqlen': + return 
'single_value_per_sample' + if metric_name == 'total_vocab_freq': + return 'accumulate_value_over_samples' + if metric_name == 'vocab_rarity': + return 'single_value_per_sample' + if metric_name == 'seqlen_vocab_rarity': + return 'single_value_per_sample' + +def run_map(): + args = get_args() + if args.analyzing_data_type == 'BERT': + args.mask_prob = 0 # When analyzing data, we don't want any mask. + train_ds, _, _ = train_valid_test_datasets_provider_bert() + elif args.analyzing_data_type == 'GPT': + train_ds, _, _ = train_valid_test_datasets_provider_gpt() + assert 'seqlen' not in args.analyzing_metric, 'GPT data has fixed seqlen, thus unnecessary to analyze seqlen metric.' + assert 'seqlen_vocab_rarity' not in args.analyzing_metric, 'GPT data has fixed seqlen, thus unnecessary to analyze seqlen metric.' + if 'vocab_rarity' in args.analyzing_metric or 'seqlen_vocab_rarity' in args.analyzing_metric: + total_vocab_freq_fname = f"{args.save}/total_vocab_freq/total_vocab_freq_metric_value" + assert os.path.isfile(f"{total_vocab_freq_fname}.bin") and os.path.isfile(f"{total_vocab_freq_fname}.idx"), "To analyze vocab rarity, first need to analyze the total vocab freq." + total_vocab_freq = MMapIndexedDataset(total_vocab_freq_fname, skip_warmup=True) + total_vocab_freq = np.copy(total_vocab_freq[0]) + total_vocab_freq[total_vocab_freq == 0] = 1 # Avoid log(0) error + total_vocab_freq = np.log(total_vocab_freq/sum(total_vocab_freq)) * -1 + args.total_vocab_freq = torch.tensor(total_vocab_freq, dtype=torch.double) + if 'seqlen_vocab_rarity' in args.analyzing_metric: + # Use large coeff to make seqlen dominates vocab_rarity + max_possible_rarity = args.seq_length * torch.max(args.total_vocab_freq).item() + args.seqlen_coeff = 10 ** (math.ceil(math.log(max_possible_rarity, 10)) + 1) + print(f"Metric seqlen_vocab_rarity: using {args.seqlen_coeff} as coefficient for seqlen.") + metric_functions = [get_metric_function(x) for x in args.analyzing_metric] + metric_types = [get_metric_type(x) for x in args.analyzing_metric] + # For metric_dtypes we int64 by default since it could be hard to estimate + # the appropriate dtype before the mapping analysis. During reduce where + # we merge the analysis results, the DataAnalyzer will automatically choose + # the dtype of merged result file as the smallest one that meet the range + # requirement. + metric_dtypes = [np.int64 for x in args.analyzing_metric] + start = time.time() + data_analyzer = DataAnalyzer(train_ds, + num_workers=args.analyzing_num_workers, + worker_id=args.analyzing_worker_id, + num_threads=args.analyzing_num_threads, + specific_threads=args.analyzing_specific_threads, + batch_size=args.global_batch_size, metric_names=args.analyzing_metric, + metric_functions=metric_functions, metric_types=metric_types, + metric_dtypes=metric_dtypes, save_path=args.save) + data_analyzer.run_map() + duration = (time.time() - start) / 3600.0 + print(f"map job finished in {duration} hr.") + +def run_reduce(): + args = get_args() + if args.analyzing_data_type == 'BERT': + args.mask_prob = 0 # When analyzing data, we don't want any mask. 
+ train_ds, _, _ = train_valid_test_datasets_provider_bert() + elif args.analyzing_data_type == 'GPT': + train_ds, _, _ = train_valid_test_datasets_provider_gpt() + metric_functions = [get_metric_function(x) for x in args.analyzing_metric] + metric_types = [get_metric_type(x) for x in args.analyzing_metric] + metric_dtypes = [np.int64 for x in args.analyzing_metric] + start = time.time() + data_analyzer = DataAnalyzer(train_ds, + num_workers=args.analyzing_num_workers, + num_threads=args.analyzing_num_threads, + num_threads_reduce=args.analyzing_num_threads_reduce, + batch_size=args.global_batch_size, metric_names=args.analyzing_metric, + metric_functions=metric_functions, metric_types=metric_types, + metric_dtypes=metric_dtypes, save_path=args.save) + data_analyzer.run_reduce() + duration = (time.time() - start) / 3600.0 + print(f"reduce job finished in {duration} hr.") + +if __name__ == "__main__": + initialize_megatron(extra_args_provider=get_tasks_args, allow_no_cuda=True) + args = get_args() + if args.analyzing_task == 'map': + run_map() + elif args.analyzing_task == 'reduce': + run_reduce() + else: + raise NotImplementedError('Task {} is not implemented.'.format( + args.analyzing_task)) diff --git a/examples/data_efficiency/bert/ds_analyze_bert_data_map.sh b/examples/data_efficiency/bert/ds_analyze_bert_data_map.sh new file mode 100644 index 000000000..7f23e3615 --- /dev/null +++ b/examples/data_efficiency/bert/ds_analyze_bert_data_map.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +num_workers=1 # Num nodes to run the map job +num_threads=40 # Num threads on each node. Set this based on #CPU cores + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=5 + +# Which node is this node (start with 0 and end with num_workers-1). This +# script only launch the map job on 1 worker node, since we don't expect +# running on many nodes and workers don't need any communication. But you +# can modify this script to add a MPI/torch distributed launcher. +worker_id=$1 +save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen' + +seq_len=512 +batch_size=10000 + +jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" +## Public the Pile dataset, see prepare_pile_data.py in the same directory +## about how to download and preprocess the data. +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_bert" +data_home="/blob/data/the_pile_bert" +data_path="${data_home}/pile_bert_train_text_sentence" + +vocab_path="bert-large-uncased-vocab.txt" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
+options=" \ + --analyzing-task map \ + --analyzing-data-type BERT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-worker-id ${worker_id} \ + --analyzing-num-threads ${num_threads} \ + --vocab-file ${vocab_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type BertWordPieceLowerCase \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh b/examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh new file mode 100644 index 000000000..f0d14df96 --- /dev/null +++ b/examples/data_efficiency/bert/ds_analyze_bert_data_reduce.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Set these 2 to the same as what you used during map job. We need these 2 +# configs to know how many map job result files do we have. +num_workers=1 +num_threads=40 +# Reduce job only has 1 worker but can accelerate by multithreading. +num_threads_reduce=40 + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=5 + +save_path="/blob/users/conglli/data/analysis_pile_bert_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen_vocab_rarity' # this requires the result of total_vocab_freq +# metric='seqlen' + +seq_len=512 +batch_size=10000 + +jobname="bert-pile-analyzing-${metric}-${num_epochs}epoch-reduce" +## Public the Pile dataset, see prepare_pile_data.py in the same directory +## about how to download and preprocess the data. +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_bert" +data_home="/blob/data/the_pile_bert" +data_path="${data_home}/pile_bert_train_text_sentence" + +vocab_path="bert-large-uncased-vocab.txt" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
+options=" \ + --analyzing-task reduce \ + --analyzing-data-type BERT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-num-threads ${num_threads} \ + --analyzing-num-threads-reduce ${num_threads_reduce} \ + --vocab-file ${vocab_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type BertWordPieceLowerCase \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json b/examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json new file mode 100644 index 000000000..2700805d1 --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_config_bert_TEMPLATE.json @@ -0,0 +1,24 @@ +{ + "train_batch_size" : CONFIG_BATCH_SIZE, + "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false +} diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh b/examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh new file mode 100644 index 000000000..e88f7beb0 --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_bert_mnli.sh @@ -0,0 +1,150 @@ +seed=1234 +pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. +seq_len=512 + +## From Table 6 in https://arxiv.org/abs/1909.08053. +task="MNLI" +global_batch_size=128 +lr=1e-5 +epochs=10 + +train_data="/blob/data/GlueData/MNLI/train.tsv" +valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \ + /blob/data/GlueData/MNLI/dev_mismatched.tsv" + +## Adjust based on number of GPUs. +batch_size=16 + +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. 
+## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 +save_interval=500000 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! -f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}" +mkdir -p ${checkpoint_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.065 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --save-interval ${save_interval} \ + --save ${checkpoint_path} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev set accuracy numbers by +# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log" +deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh b/examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh new file mode 100644 index 000000000..8083e1024 --- 
/dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_bert_qqp.sh @@ -0,0 +1,158 @@ +seed=1234 +pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. +seq_len=512 + +## From Table 6 in https://arxiv.org/abs/1909.08053. +task="QQP" + +train_data="/blob/data/GlueData/QQP/train.tsv" +valid_data="/blob/data/GlueData/QQP/dev.tsv" + +## Adjust based on number of GPUs. +batch_size=16 + +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=128 +# lr=5e-5 +# epochs=12 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 +global_batch_size=128 +lr=5e-5 +epochs=12 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 +# global_batch_size=128 +# lr=3e-5 +# epochs=12 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +# global_batch_size=256 +# lr=4e-5 +# epochs=12 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 +save_interval=500000 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}" +mkdir -p ${checkpoint_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.065 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --save-interval ${save_interval} \ + --save ${checkpoint_path} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev set accuracy numbers by +# "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log" +deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh b/examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh new file mode 100644 index 000000000..15658e3d2 --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_bert_race.sh @@ -0,0 +1,172 @@ +seed=1234 +## RACE have two sub-tasks that need to be finetuned separately +difficulty="middle" +# difficulty="high" +pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. 
+seq_len=512 + +## From Table 6 in https://arxiv.org/abs/1909.08053. +task="RACE" + +## Race dataset can be downloaded by: +## wget http://www.cs.cmu.edu/~glai1/data/race/RACE.tar.gz +train_data="/blob/data/RACE/train/${difficulty}" + +## The Megatron paper https://arxiv.org/abs/1909.08053 says: "For the test set +## results of RACE, we first use the development set to find the checkpoint +## that gives us the median score on the 5 random seeds and we report the +## results from that checkpoint on the test set", which is a quite confusing +## description. For simplicity, instead we directly get the median dev and test +## set score on 5 random seeds from a single pretrained_checkpoint. +valid_data="/blob/data/RACE/dev/${difficulty} \ + /blob/data/RACE/test/${difficulty}" + +## Adjust based on number of GPUs. +batch_size=4 + +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=32 +# lr=2e-5 +# epochs=3 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 +global_batch_size=32 +lr=2e-5 +epochs=3 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 +# global_batch_size=16 +# lr=1e-5 +# epochs=3 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +# global_batch_size=32 +# lr=2e-5 +# epochs=3 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 +save_interval=100000 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-${difficulty}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +checkpoint_path="${pretrained_checkpoint}-finetune/${jobname}" +mkdir -p ${checkpoint_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.06 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --save-interval ${save_interval} \ + --save ${checkpoint_path} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --clip-grad 1.0 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev/test set accuracy numbers +# by "grep -e "overall:" -e "metrics for" ${checkpoint_path}/output.log" +deepspeed ../../../../tasks/main.py ${options} &> ${checkpoint_path}/output.log diff --git a/examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py b/examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py new file mode 100644 index 000000000..6fffe829d --- /dev/null +++ b/examples/data_efficiency/bert/finetune/ds_finetune_gather_result.py @@ -0,0 +1,111 @@ +import os +import statistics + +def gather_numbers(fname, match_keywords, index_keywords, index_offsets): + results = {} + for k in index_keywords: + results[k] = [] + file1 = open(fname, 'r') + while True: + line = file1.readline() + if not line: + break + splits = line.split(' ') + for i in range(len(match_keywords)): + if match_keywords[i] in line: + ref_idx = splits.index(index_keywords[i]) + results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]])) + file1.close() + return results + +def gather_MNLI_results(result_path): + overall = [] + matched = [] + 
mismatched = [] + for file in os.listdir(result_path): + if file.startswith('MNLI'): + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + results = gather_numbers(fname, + ['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'], + ['overall:', 'dev-matched:', 'dev-mismatched:'], + [9, 9, 9]) + overall_candidate = results['overall:'] + matched_candidate = results['dev-matched:'] + mismatched_candidate = results['dev-mismatched:'] + if len(overall_candidate) > 0: + assert len(overall_candidate) == len(matched_candidate) and len(overall_candidate) == len(mismatched_candidate) + best_index = overall_candidate.index(max(overall_candidate)) + overall.append(overall_candidate[best_index]) + matched.append(matched_candidate[best_index]) + mismatched.append(mismatched_candidate[best_index]) + if len(overall) > 0: + if len(overall) % 2 == 1: + median_idx = overall.index(statistics.median(overall)) + else: + median_idx = overall.index(statistics.median_high(overall)) + print(f'MNLI how Megatron paper reported: overall results median {statistics.median(overall)}, corresponding matched/mismatched: {matched[median_idx]}/{mismatched[median_idx]}') + print(f'MNLI other results:') + print(f'MNLI overall results {overall}, median {statistics.median(overall)} (corresponding matched/mismatched {matched[median_idx]}/{mismatched[median_idx]}), mean {statistics.mean(overall)}, std {statistics.stdev(overall)}') + print(f'MNLI matched results {matched}, median {statistics.median(matched)}, mean {statistics.mean(matched)}, std {statistics.stdev(matched)}') + print(f'MNLI mismatched results {mismatched}, median {statistics.median(mismatched)}, mean {statistics.mean(mismatched)}, std {statistics.stdev(mismatched)}') + else: + print("Didn't find any MNLI result") + +def gather_QQP_results(result_path): + overall = [] + for file in os.listdir(result_path): + if file.startswith('QQP'): + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + results = gather_numbers(fname, ['overall:'], ['overall:'], [9]) + overall_candidate = results['overall:'] + if len(overall_candidate) > 0: + best_index = overall_candidate.index(max(overall_candidate)) + overall.append(overall_candidate[best_index]) + if len(overall) > 0: + print(f'QQP how Megatron paper reported: overall results median {statistics.median(overall)}') + print(f'QQP other results:') + print(f'QQP overall results {overall}, median {statistics.median(overall)}, mean {statistics.mean(overall)}, std {statistics.stdev(overall)}') + else: + print("Didn't find any QQP result") + +def gather_RACE_results(result_path, task): + dev = [] + test = [] + for file in os.listdir(result_path): + if file.startswith(f'RACE-{task}'): + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + results = gather_numbers(fname, + [f'metrics for dev-{task}:', f'metrics for test-{task}:'], + [f'dev-{task}:', f'test-{task}:'], + [9, 9]) + dev_candidate = results[f'dev-{task}:'] + test_candidate = results[f'test-{task}:'] + if len(dev_candidate) > 0: + assert len(dev_candidate) == len(test_candidate) + dev.append(max(dev_candidate)) + test.append(max(test_candidate)) + if len(dev) > 0: + if len(dev) % 2 == 1: + median_idx = dev.index(statistics.median(dev)) + else: + median_idx = dev.index(statistics.median_high(dev)) + print(f'RACE-{task} how Megatron paper reported: test result from the median of dev results {test[median_idx]}') + print(f'RACE-{task} other results:') + print(f'RACE-{task} dev results {dev}, median 
{statistics.median(dev)}, mean {statistics.mean(dev)}, std {statistics.stdev(dev)}') + print(f'RACE-{task} test results {test}, median {statistics.median(test)}, mean {statistics.mean(test)}, std {statistics.stdev(test)}') + else: + print(f"Didn't find any RACE-{task} result") + +def gather_finetune_results(result_path): + print(f'Gather finetune results for {result_path}') + gather_MNLI_results(result_path) + gather_QQP_results(result_path) + gather_RACE_results(result_path, 'middle') + gather_RACE_results(result_path, 'high') + +if __name__ == '__main__': + result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/' + gather_finetune_results(result_path) \ No newline at end of file diff --git a/examples/data_efficiency/bert/pile_data_download_preprocess.py b/examples/data_efficiency/bert/pile_data_download_preprocess.py new file mode 100644 index 000000000..1eb34124b --- /dev/null +++ b/examples/data_efficiency/bert/pile_data_download_preprocess.py @@ -0,0 +1,129 @@ +import zstandard +import sys +import time +import os + +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), + os.path.pardir,os.path.pardir,os.path.pardir))) +from megatron.data import indexed_dataset + +def pile_download(download_url, file_path, i): + start = time.time() + zstd_file_path = f"{file_path}{i:02}.jsonl.zst" + download_path = f"{download_url}{i:02}.jsonl.zst" + if not os.path.exists(zstd_file_path): + os.system(f"wget -P {file_path} {download_path}") + print(f"Finished downloading chunk {i} in {time.time() - start} sec") + +def pile_decompress(download_url, file_path, i): + zstd_file_path = f"{file_path}{i:02}.jsonl.zst" + output_path = f"{file_path}{i:02}.jsonl" + if not os.path.exists(output_path): + if not os.path.exists(zstd_file_path): + pile_download(download_url, file_path, i) + start = time.time() + with open(zstd_file_path, 'rb') as compressed: + decomp = zstandard.ZstdDecompressor() + with open(output_path, 'wb') as destination: + decomp.copy_stream(compressed, destination) + os.remove(zstd_file_path) + print(f"Finished decompressing chunk {i} in {time.time() - start} sec") + +def pile_preprocess(download_url, file_path, vocab_file, num_workers, i): + json_file_path = f"{file_path}{i:02}.jsonl" + output_prefix = f"{file_path}pile_bert_train_{i:02}" + if not os.path.exists(f"{output_prefix}_text_sentence.idx"): + if not os.path.exists(json_file_path): + pile_decompress(download_url, file_path, i) + start = time.time() + cmd = f"python ../../tools/preprocess_data.py \ + --input {json_file_path} \ + --output-prefix {output_prefix} \ + --vocab {vocab_file} \ + --dataset-impl mmap \ + --tokenizer-type BertWordPieceLowerCase \ + --split-sentences \ + --workers {num_workers} " + # It's possible to hit MemoryError during above cmd since the memory + # usage is proportional to num_workers. In this case we delete the + # incomplete output and user shall retry with smaller num_workers. + # Our experience show that chunk 6, 7, 9, 17, 18, 20, 21, 24, 27 + # particularly have large memory usage. + if os.system(cmd) == 0: # Success + os.remove(json_file_path) + else: + print(f"Error: chunk {i} preprocessing got error, delete \ + incomplete output. 
If a MemoryError appeared, please retry \
+                with num_workers smaller than {num_workers}.")
+            if os.path.exists(f"{output_prefix}_text_sentence.idx"):
+                os.remove(f"{output_prefix}_text_sentence.idx")
+            if os.path.exists(f"{output_prefix}_text_sentence.bin"):
+                os.remove(f"{output_prefix}_text_sentence.bin")
+        print(f"Finished preprocessing chunk {i} in {time.time() - start} sec")
+
+def pile_merge(file_path):
+    start = time.time()
+    num_chunks = 30
+    vocab_size = 30524
+    for i in range(num_chunks):
+        output_prefix = f"{file_path}pile_bert_train_{i:02}"
+        assert os.path.exists(f"{output_prefix}_text_sentence.idx")
+        assert os.path.exists(f"{output_prefix}_text_sentence.bin")
+    builder = indexed_dataset.make_builder(
+        f"{file_path}pile_bert_train_text_sentence.bin", impl="mmap",
+        vocab_size=vocab_size)
+    for i in range(num_chunks):
+        chunk_file = f"{file_path}pile_bert_train_{i:02}_text_sentence"
+        print(f"Merging file {chunk_file}")
+        builder.merge_file_(chunk_file)
+    print("Finalizing merged file ...")
+    builder.finalize(f"{file_path}pile_bert_train_text_sentence.idx")
+    print(f"Finished merging in {time.time() - start} sec")
+    # After verifying the merged data with real training, you may want to
+    # delete the data chunks.
+    # for i in range(num_chunks):
+    #     output_prefix = f"{file_path}pile_bert_train_{i:02}"
+    #     os.remove(f"{output_prefix}_text_sentence.idx")
+    #     os.remove(f"{output_prefix}_text_sentence.bin")
+
+if __name__ == '__main__':
+    # Path to download and store all the output files during the whole process.
+    # Estimated max storage usage would be around 1.6 TB (or 780GB if you skip
+    # the final merge). Memory usage is proportional to the num_workers below
+    # (can be as high as O(300GB) if num_workers is around 20).
+    file_path = "/blob/data/the_pile_bert/"
+    # The raw Pile data has 30 compressed .zst chunks. To process all chunks on
+    # a single machine, run "python pile_data_download_preprocess.py range 0 30".
+    # You can also split the chunks across multiple machines to speed things up,
+    # since processing one chunk can take hours. The whole process only uses CPU.
+    if sys.argv[1] == "merge":
+        # "python pile_data_download_preprocess.py merge" means merge all 30
+        # processed data chunks. Run it only after all 30 chunks are
+        # preprocessed. The memory usage during merge is about 600GB. If you
+        # don't have enough memory, one solution is to directly use the 30 data
+        # chunks as multiple datasets. See '--data-path' in
+        # github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/arguments.py
+        pile_merge(file_path)
+    else:
+        if sys.argv[1] == "range":
+            # "python pile_data_download_preprocess.py range 0 30" means
+            # process chunks 0-29
+            selected_chunk = range(int(sys.argv[2]), int(sys.argv[3]))
+        else:
+            # "python pile_data_download_preprocess.py 2 5 8" means process
+            # chunks 2, 5, 8
+            selected_chunk = [int(x) for x in sys.argv[1:]]
+        print("selected_chunk: ", selected_chunk)
+        # Number of processes. Adjust based on your CPU/memory.
+        num_workers = 20
+        # Where the raw Pile data can be downloaded. The url may change in the
+        # future. Contact EleutherAI (https://github.com/EleutherAI/the-pile)
+        # if this url does not work. 
+ download_url = "https://the-eye.eu/public/AI/pile/train/" + vocab_file = "bert-large-uncased-vocab.txt" + vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt" + if not os.path.exists(vocab_file): + os.system(f"wget {vocab_url}") + os.makedirs(file_path, exist_ok=True) + + for i in selected_chunk: + pile_preprocess(download_url, file_path, vocab_file, num_workers, i) diff --git a/examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json b/examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json new file mode 100644 index 000000000..38846c404 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_config_bert_1clmetric_TEMPLATE.json @@ -0,0 +1,74 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "encoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json b/examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json new file mode 100644 index 000000000..2f7268dd3 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_config_bert_2clmetrics_TEMPLATE.json @@ -0,0 +1,88 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + 
"random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "encoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + }, + "CL_2nd_METRIC_NAME": { + "index_to_sample_path": "CL_2nd_SAMPLE_PATH", + "index_to_metric_path": "CL_2nd_METRIC_PATH", + "difficulty_type": "CL_2nd_DIFF_TYPE", + "clustering_type": "CL_2nd_CLUSTER_TYPE", + "min_difficulty": CL_2nd_MIN, + "max_difficulty": CL_2nd_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_2nd_TOTAL_STEP, + "difficulty_step": CL_2nd_DIFF_STEP, + "root_degree": CL_2nd_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh new file mode 100644 index 000000000..551ca3118 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh @@ -0,0 +1,472 @@ +#!/bin/bash +dir=`pwd` +############################################################################### +### Main configs +### The main configs are from Megatron-LM paper +### https://arxiv.org/abs/1909.08053. Choose based on your desired model size +### or build your own configs. +seq_len=512 +global_batch_size=1024 +# lr=1e-4 +lr=$1 +min_lr=1e-5 + +## init_std is the standard deviation for weight initialization. Usually larger +## model needs lower std. Here we roughly follow a heuristic equation of +## sqrt(1/3/hidden_size) from https://arxiv.org/pdf/2201.11990.pdf + +## In addition, we find that the 3.9B model (even after tuning init_std) has +## NaN loss issue from the beginning thus unable to train. This is probably +## because in this example we use the public Pile data, which is a more diverse +## (and potentially more noisy) data than what used in Megatron paper. One +## potential solution is only use the sub datasets in Pile that are also +## used by Megatron paper. 
+ +## BERT 110M (same config as original BERT-Base model) +## This config is not included in Megatron-LM paper +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# init_std=0.02 + +## BERT 336M (same config as original BERT-Large model) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 +init_std=0.02 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 +# init_std=0.013 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +# init_std=0.011 +############################################################################### +### Training duration configs +## The main termination condition, original Megatron paper trains for 2M iters. +## We changed to token-based termination since data efficiency techniques could +## change token per step. +calc() { awk "BEGIN{ printf \"%.0f\n\", $* }"; } +# train_iters_in_million=2 +train_iters_in_million=$2 +train_tokens=$(calc $train_iters_in_million*1000000*$seq_len*$global_batch_size) +train_tokens_in_billion=$(calc $train_tokens/1000000000) + +## A large enough number of iters, just to make sure we index enough data. The +## only effective termination condition is the train_tokens above. +train_iters=4000000 + +## Another wall-clock time termination condition in minutes. Set it large +## enough to avoid undesired early termination. +exit_duration=30000000 +############################################################################### +### lr configs +## lr warmup and decay duration. Original Megatron paper uses 10000 warmup +## iters. We changed lr decay to token based since data efficiency techniques +## could change token per step. +lr_warmup_iters=10000 +lr_decay_tokens_in_billion=${train_tokens_in_billion} +lr_decay_tokens=${train_tokens} +lr_decay_style="linear" +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +## Note that currently both curriculum learning and random-LTD are NOT +## compatible with pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO-based data parallelism, stage=0 will disable ZeRO +zero_stage=0 + +## Total number of GPUs. ds_ssh is from DeepSpeed library. +num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2)) +num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +num_node=$(( ${num_gpus} / ${num_gpus_pernode} )) + +## Data parallel size. +dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} )) + +## Micro batch size per GPU +## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus +## Reduce it manually if GPU OOM +batch_size=$(( ${global_batch_size} / ${dp_size} )) +############################################################################### +### Random layerwise token dropping (random-LTD) configs +## random-LTD's main switch. "false" means disabled. "true" means enabled. +ltd_enabled=${3:-'false'} +## How much dropping ratio to start with. The value denotes the seqlen after +## dropping. +ltd_start=${4:-512} +## How many steps for random-LTD to gradually reduce dropping ratio to zero. 
+ltd_step_in_million=${5:-1}
+
+# ltd_enabled="true"
+# ltd_start=200
+# ltd_step_in_million=1.8
+ltd_step=$(calc $ltd_step_in_million*1000000)
+
+## For BERT pretraining, we observe that random-LTD combined with zero dropout
+## can achieve better finetune accuracy on certain tasks. However, this is not
+## guaranteed for all models/tasks. It is still recommended to try random-LTD
+## both with and without dropout.
+dropout=${6:-0.1}
+###############################################################################
+### Curriculum learning (CL) configs
+## CL's main switch. "false" means disabled. "true" means enabled.
+cl_enabled=${7:-'false'}
+## Number of CL metrics to use.
+cl_num_metric=${8:-1}
+
+## Name of difficulty metric
+cl_1st_metric=${9:-'dummy'}
+## Path to the data indexes for this difficulty metric. Samples on the ith row
+## of index_to_sample have the difficulty value given on the ith row of
+## index_to_metric.
+cl_1st_index_to_sample_path=${10:-'dummy'}
+cl_1st_index_to_metric_path=${11:-'dummy'}
+## During training, whether to increase difficulty by value or by percentile.
+cl_1st_difficulty_type=${12:-'value'}
+## "single_cluster" means no clustering is required and CL is likely achieved
+## by data postprocessing. "schedule_based" means data will be clustered based
+## on the difficulty schedule (pacing function) below.
+cl_1st_clustering_type=${13:-'single_cluster'}
+## Start difficulty
+cl_1st_min=${14:-512}
+## End difficulty
+cl_1st_max=${15:-512}
+## Total steps to reach end difficulty
+cl_1st_total_step_in_million=${16:-1}
+## When changing difficulty, always make sure it's a multiple of the
+## difficulty_step below.
+cl_1st_difficulty_step=${17:-1}
+## Root degree of the schedule (pacing function).
+cl_1st_root=${18:-1}
+
+cl_2nd_metric=${19:-'dummy'}
+cl_2nd_index_to_sample_path=${20:-'dummy'}
+cl_2nd_index_to_metric_path=${21:-'dummy'}
+cl_2nd_difficulty_type=${22:-'value'}
+cl_2nd_clustering_type=${23:-'single_cluster'}
+cl_2nd_min=${24:-2048}
+cl_2nd_max=${25:-2048}
+cl_2nd_total_step_in_million=${26:-1}
+cl_2nd_difficulty_step=${27:-1}
+cl_2nd_root=${28:-1}
+
+# cl_enabled="true"
+# cl_num_metric=2
+# cl_1st_metric="voc"
+# ## The *_index_to_sample_percentile_merged is a concatenated index for perf
+# ## improvement, but it only works when you set difficulty_type="percentile" in
+# ## ds_config. 
If you use difficulty_type="value", you need to change this to +# ## *_index_to_sample +# # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 + +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=0.96 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 + +cl_1st_total_step=$(calc $cl_1st_total_step_in_million*1000000) +cl_2nd_total_step=$(calc $cl_2nd_total_step_in_million*1000000) +############################################################################### +### Misc configs +log_interval=100 +eval_iters=10 +eval_interval=1000 +# num_save controls how frequent to save checkpoint. num_save=20 means that a +# checkpoint will be saved every 5% of training. For longer training you would +# want larger num_save to save more frequently, and vice versa. +num_save=100 +estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size})) +save_interval=$((${estimated_train_iter} / ${num_save})) + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" + +## Whether or not log optimizer states (norms, max abs values) to tensorboard. +## This is not required for training and might save GPU memory when turned off. +log_optimizer_state="true" +############################################################################### +### Output and data configs +current_time=$(date "+%Y.%m.%d_%H.%M.%S") +host="${HOSTNAME}" +seed=1234 +## Number of workers for dataloader. We found that for BERT pre-training, +## num_workers will greatly affect data loading time and overall training +## time. In our experiment with 64 GPUs, the performance reaches peak at +## num_workers = 4 but it may differ depending on hardware. Also note that +## larger num_workers add more CPU computation/memory overhead. +num_workers=4 + +## Public the Pile dataset, see ../pile_data_download_preprocess.py about how +## to download and preprocess the data. Change data_home to where you store the +## pile_bert_train_text_sentence.bin and pile_bert_train_text_sentence.idx. +data_home="/vc_data_blob/users/conglli/the_pile_bert" +if [[ "$host" == *"webxt"* ]]; then + data_home="/blob/data/the_pile_bert" +fi +data_path="${data_home}/pile_bert_train_text_sentence" +## train_idx_path forces Megatron to use a specific data index file generated +## when we analyze data. This is needed because our index for curriculum +## learning difficulty metric is based on this data index. +train_idx_path="${data_home}/pile_bert_train_text_sentence_train_indexmap_exact5ep_509msl_0.10ssp_1234s.npy" + +vocab_path="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +prescale_grad="true" +jobname="bert_${model_size}B_tok${train_tokens_in_billion}B" +jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_iters}_d${lr_decay_tokens_in_billion}B_${lr_decay_style}" +jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}" +if [[ $zero_stage -gt 0 ]]; then + jobname="${jobname}_z${zero_stage}" + prescale_grad="false" +fi +if [[ $mp_size -gt 1 ]]; then + jobname="${jobname}_mp${mp_size}" +fi +if [ "${no_pp}" = "false" ]; then + jobname="${jobname}_pp${pp_size}" +fi +jobname="${jobname}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + jobname="${jobname}_ltd_${ltd_start}_${ltd_step_in_million}M_drop${dropout}" +fi +if [ "${cl_enabled}" = "true" ]; then + jobname="${jobname}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step_in_million}M_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + jobname="${jobname}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step_in_million}M_${cl_2nd_root}" + fi +fi + +username=$(whoami) +output_home="/blob/users/${username}/project/data_efficient_bert" +log_path="${output_home}/log/" +checkpoint_path="${output_home}/checkpoint/${jobname}" +## Microsoft internal constraint: because tensorboard is logged by last rank, +## it's better to put the path in NFS instead of Blob. +tensorboard_dir="/vc_data/users/${username}/project/data_efficient_bert/tensorboard/" +tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}" +mkdir -p ${log_path} +mkdir -p ${checkpoint_path} +mkdir -p ${tensorboard_path} +if [ "${cl_enabled}" = "true" ]; then + data_cluster_path="${output_home}/data_cluster/${jobname}" + mkdir -p ${data_cluster_path} +fi +############################################################################### +data_options=" \ + --vocab-file ${vocab_path} \ + --data-path ${data_path} \ + --data-impl mmap" + +## If CL is used, make sure to set "--split" the same as what you used during +## offline data analysis&indexing. 
+megatron_options=" \ + --override-lr-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size ${mp_size} \ + --init-method-std ${init_std} \ + --lr-decay-tokens ${lr_decay_tokens} \ + --lr-warmup-iters ${lr_warmup_iters} \ + --micro-batch-size ${batch_size} \ + --exit-duration-in-mins ${exit_duration} \ + --global-batch-size ${global_batch_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --train-tokens ${train_tokens} \ + --train-iters ${train_iters} \ + --lr ${lr} \ + --min-lr ${min_lr} \ + --lr-decay-style ${lr_decay_style} \ + --split 949,50,1 \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --save-interval ${save_interval} \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --num-workers ${num_workers} \ + --fp16 \ + --seed ${seed} \ + --load ${checkpoint_path} \ + --save ${checkpoint_path} \ + --tensorboard-queue-size 1 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --tensorboard-dir ${tensorboard_path}" + +if [ "${activation_checkpoint}" = "true" ]; then +megatron_options="${megatron_options} \ + --checkpoint-activations" +fi + +if [ "${log_optimizer_state}" = "true" ]; then +megatron_options="${megatron_options} \ + --log-optimizer-states-to-tensorboard" +fi + +if [ "${ltd_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --attention-dropout ${dropout} \ + --hidden-dropout ${dropout} \ + --random-ltd" +fi + +if [ "${cl_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --train-idx-path ${train_idx_path} \ + --data-efficiency-curriculum-learning" +fi + +config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + config_json="${config_json}_ltd_${ltd_start}_${ltd_step}" +fi +if [ "${cl_enabled}" = "true" ]; then + config_json="${config_json}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + config_json="${config_json}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}" + fi +fi +config_json="${config_json}.json" +if [[ $cl_num_metric -gt 1 ]]; then +template_json="ds_config_bert_2clmetrics_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed 
"s/CL_1st_ROOT/${cl_1st_root}/" \ + | sed "s#CL_2nd_METRIC_NAME#${cl_2nd_metric}#" \ + | sed "s#CL_2nd_SAMPLE_PATH#${cl_2nd_index_to_sample_path}#" \ + | sed "s#CL_2nd_METRIC_PATH#${cl_2nd_index_to_metric_path}#" \ + | sed "s#CL_2nd_DIFF_TYPE#${cl_2nd_difficulty_type}#" \ + | sed "s#CL_2nd_CLUSTER_TYPE#${cl_2nd_clustering_type}#" \ + | sed "s/CL_2nd_MIN/${cl_2nd_min}/" \ + | sed "s/CL_2nd_MAX/${cl_2nd_max}/" \ + | sed "s/CL_2nd_TOTAL_STEP/${cl_2nd_total_step}/" \ + | sed "s/CL_2nd_DIFF_STEP/${cl_2nd_difficulty_step}/" \ + | sed "s/CL_2nd_ROOT/${cl_2nd_root}/" \ + > ${config_json} +else +template_json="ds_config_bert_1clmetric_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed "s/CL_1st_ROOT/${cl_1st_root}/" \ + > ${config_json} +fi + +deepspeed_options=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --pipeline-model-parallel-size ${pp_size}" + +if [[ "${no_pp}" = "true" ]]; then +deepspeed_options="${deepspeed_options} \ + --no-pipeline-parallel" +fi + +if [ "${activation_checkpoint}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --deepspeed-activation-checkpointing" +fi + +## When saving checkpoint to a storage with cache, their could be consistency +## issue of the pointer to latest checkpoint. Here we find the correct pointer +## and broadcast it to all nodes. +iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt" +iteration_file_2="$checkpoint_path/latest" +iteration=0 +for (( node = 0; node <= num_node-1; node++ )) +do + if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then + local_iteration=$(ssh -q worker-"$node" cat $iteration_file) + iteration=$(( ${local_iteration} > ${iteration} ? 
${local_iteration} : ${iteration} )) + fi +done +if [[ $iteration -gt 0 ]]; then + iteration_2="global_step${iteration}" + ds_ssh "echo $iteration > $iteration_file" + ds_ssh "echo $iteration_2 > $iteration_file_2" +fi + +deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log \ No newline at end of file diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh new file mode 100644 index 000000000..46c6c48b5 --- /dev/null +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh @@ -0,0 +1,241 @@ +############################################################################### +### Each block below is one pretraining setup. Uncomment one block to try. +############################################################################### +### Baseline cases, mostly based on Megatron-LM's BERT-Large hyperparameters, +### but with some changes (different LR schedule). +## Baseline 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} +############################################################################### +### Curriculum learning (CL) + Random layerwise token dropping (random-LTD). +### Due to resource constraints, we did not finish training any model with this +### setup. This example is just to demonstrate that CL+random-LTD can run for +### BERT pretraining. +## CL+random-LTD 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="true" +# ltd_start=200 +# ltd_step_in_million=1.8 +# dropout=0 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=0.96 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} \ +############################################################################### +### Random layerwise token dropping (random-LTD). 
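+## The positional arguments passed to ds_pretrain_bert_336M_base_script.sh in
+## the cases below map to that script's parameters in order: lr,
+## train_iters_in_million, ltd_enabled, ltd_start, ltd_step_in_million,
+## dropout, followed by the cl_* arguments in the curriculum learning cases.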
+## random-LTD 723B tokens (69%): +# lr=1.45e-4 +# train_iters_in_million=1.38 +# ltd_enabled="true" +# ltd_start=200 +# ltd_step_in_million=1.8 +# dropout=0 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} +############################################################################### +### Curriculum learning (CL). +## CL vocab rarity 734B tokens (70%): +# lr=1.4e-4 +# train_iters_in_million=1.4 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.7 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL vocab rarity + seqlen truncation 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=0.96 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen reorder 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# 
cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlenvocabrarity" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen_vocab_rarity/seqlen_vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen_vocab_rarity/seqlen_vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL vocab rarity 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=600 +# cl_1st_max=9069 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen truncation 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen_truncate" +# cl_1st_index_to_sample_path="dummy" +# cl_1st_index_to_metric_path="dummy" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=128 +# cl_1st_max=512 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=8 +# cl_1st_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen reorder 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=0.1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen" +# 
cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen/seqlen_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/seqlen/seqlen_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=0.96 +# cl_1st_difficulty_step=8 +# cl_1st_root=2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### \ No newline at end of file diff --git a/examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh b/examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh new file mode 100644 index 000000000..3b1caf06f --- /dev/null +++ b/examples/data_efficiency/gpt/ds_analyze_gpt_data_map.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +num_workers=1 # Num nodes to run the map job +num_threads=40 # Num threads on each node. Set this based on #CPU cores + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=1 + +# Which node is this node (start with 0 and end with num_workers-1). This +# script only launch the map job on 1 worker node, since we don't expect +# running on many nodes and workers don't need any communication. But you +# can modify this script to add a MPI/torch distributed launcher. +worker_id=$1 +save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq + +seq_len=2048 +batch_size=10000 + +jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-map-worker${worker_id}" +# Public the Pile dataset, can be downloaded at +# https://mystic.the-eye.eu/public/AI/pile_neox/ +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" +data_home="/blob/data/the_pile_public_merged_nopreprocessing" +data_path="${data_home}/pile_text_document" + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! -f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
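+
+# Example invocation (illustrative; adjust to your setup): with the default
+# num_workers=1 above, run "bash ds_analyze_gpt_data_map.sh 0" on one node.
+# With more workers, run this script once per node with that node's worker_id
+# (0 to num_workers-1), then run ds_analyze_gpt_data_reduce.sh to combine the
+# per-worker results.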
+options=" \ + --analyzing-task map \ + --analyzing-data-type GPT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-worker-id ${worker_id} \ + --analyzing-num-threads ${num_threads} \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh b/examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh new file mode 100644 index 000000000..a1242ea94 --- /dev/null +++ b/examples/data_efficiency/gpt/ds_analyze_gpt_data_reduce.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Set these 2 to the same as what you used during map job. We need these 2 +# configs to know how many map job result files do we have. +num_workers=1 +num_threads=40 +# Reduce job only has 1 worker but can accelerate by multithreading. +num_threads_reduce=40 + +# If different data epochs have slightly different data samples (e.g., due +# to randomness), then you need to specify large enough num_epochs that cover +# whole pretraining. If different data epochs are the same, set num_epochs to +# 1 to only index 1 epoch, and during pretraining DeepSpeed data efficiency +# library will automatically handle reshuffling when reaching another epoch. +num_epochs=1 + +save_path="/blob/users/conglli/data/analysis_pile_gpt_${num_epochs}epoch/" + +metric='total_vocab_freq' +# metric='vocab_rarity' # this requires the result of total_vocab_freq + +seq_len=2048 +batch_size=10000 + +jobname="gpt-pile-analyzing-${metric}-${num_epochs}epoch-reduce" +# Public the Pile dataset, can be downloaded at +# https://mystic.the-eye.eu/public/AI/pile_neox/ +## Change data_home to your own training data path. +# data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" +data_home="/blob/data/the_pile_public_merged_nopreprocessing" +data_path="${data_home}/pile_text_document" + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! -f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +# Make sure the "--split" is the same as what you will use for pre-training. 
+options=" \ + --analyzing-task reduce \ + --analyzing-data-type GPT \ + --analyzing-metric ${metric} \ + --analyzing-num-workers ${num_workers} \ + --analyzing-num-threads ${num_threads} \ + --analyzing-num-threads-reduce ${num_threads_reduce} \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --micro-batch-size ${batch_size} \ + --global-batch-size ${batch_size} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --num-layers 1 \ + --hidden-size 1 \ + --num-attention-heads 1 \ + --split 949,50,1 \ + --distributed-backend gloo \ + --train-data-exact-num-epochs ${num_epochs} \ + --return-data-index \ + --save-interval 1 \ + --save ${save_path}" + +python ../analyze_data.py ${options} &> ${jobname}.log \ No newline at end of file diff --git a/examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json b/examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json new file mode 100644 index 000000000..09b276c88 --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_config_eval_dummy.json @@ -0,0 +1,28 @@ +{ +"train_batch_size" : 2048, +"train_micro_batch_size_per_gpu": 16, +"steps_per_print": 10, + +"zero_optimization": { + "stage": 0, + "elastic_checkpoint": true +}, + +"gradient_clipping": 1.0, +"prescale_gradients": true, + +"fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 +}, + +"bf16": { + "enabled": false +}, + +"wall_clock_breakdown" : false +} \ No newline at end of file diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh b/examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh new file mode 100644 index 000000000..4c16e608c --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_1gpu.sh @@ -0,0 +1,77 @@ +## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md +## and follow the steps of installation/data downloading. + +## Code below only works when you run each evalharness task on a single GPU. +## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh +checkpoint_path=$1 +config_path=$2 +result_path=$3 +rank=$4 +tasks=$5 +hostname=$6 +master_port=$(( 12345 + ${rank} )) +batch_size=$7 +num_fewshot=$8 + +mp_size=1 +pp_size=1 +no_pp="true" +ep_size=1 + +vocab_file="gpt2-vocab.json" +if [ ! -f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_file="gpt2-merges.txt" +if [ ! -f "$merge_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +export HF_DATASETS_OFFLINE=1 + +dir2=$(dirname "$checkpoint_path") +dirname=$(basename "$dir2")/$(basename "$checkpoint_path") +result_path="${result_path}/${dirname}" +mkdir -p $result_path +result_file="${result_path}/${tasks}_${num_fewshot}shot.json" + +# Dummy arguments to make megatron happy. No need to configure them. +# The reason we don't need to configure them and many other arguments is +# because the eval framework will read the arguments from checkpoint file. 
+megatron_required_args="\ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 +" + +command="../../../../tasks/eval_harness/evaluate.py \ + --load ${checkpoint_path} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --moe-expert-parallel-size ${ep_size} \ + --vocab-file ${vocab_file} \ + --merge-file ${merge_file} \ + --micro-batch-size ${batch_size} \ + --no-load-optim \ + --no-load-rng \ + --inference \ + --disable-moe-token-dropping \ + --adaptive_seq_len \ + --eval_fp32 \ + --num_fewshot ${num_fewshot} \ + --task_list ${tasks} \ + --results_path ${result_file} \ + --deepspeed \ + --deepspeed_config ${config_path} \ + ${megatron_required_args} \ + " + +if [[ "${no_pp}" = "true" ]]; then +command="${command} \ + --no-pipeline-parallel" +fi + +launcher="deepspeed --include=$hostname:$rank --master_port=${master_port}" +$launcher $command &> "${result_path}/${tasks}_${num_fewshot}shot.log" \ No newline at end of file diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py b/examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py new file mode 100644 index 000000000..e0c0c332c --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_gather_result.py @@ -0,0 +1,358 @@ +import json +import os +import math +from math import log10, floor +import copy + +def mean(arr): + return sum(arr) / len(arr) + + +def pop_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr)) + + +def sample_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1)) + + +def mean_stderr(arr): + return sample_stddev(arr) / math.sqrt(len(arr)) + + +def median(arr): + return arr[len(arr) // 2] + +metric_dict = { + "hellaswag":"acc_norm", + "lambada":"acc", + "triviaqa":"acc", + "webqs":"acc", + "winogrande":"acc", + "piqa":"acc_norm", + "arc_challenge":"acc_norm", + "arc_easy":"acc_norm", + "openbookqa":"acc_norm", + "race":"acc", + "boolq":"acc", + "cb":"acc", + "copa":"acc", + "rte":"acc", + "wic":"acc", + "wsc":"acc", + "multirc":"acc", + "record":"f1", + "anli_r1":"acc", + "anli_r2":"acc", + "anli_r3":"acc", + "wikitext":"word_perplexity", + "logiqa":"acc_norm", + "mathqa":"acc_norm", + "mc_taco":"f1", + "mrpc":"acc", + "prost":"acc_norm", + "pubmedqa":"acc", + "qnli":"acc", + "qqp":"acc", + "sciq":"acc_norm", + "sst":"acc", + "wnli":"acc" +} + +official_dict = { + "hellaswag":["HellaSwag","acc"], + "lambada":["LAMBADA","acc"], + "triviaqa":["TriviaQA","acc"], + "webqs":["WebQs","acc"], + "winogrande":["Winogrande","acc"], + "piqa":["PIQA","acc"], + "arc_challenge":["ARC Challenge","acc"], + "arc_easy":["ARC Easy","acc"], + "openbookqa":["OpenBookQA","acc"], + "race":["RACE-h","acc"], + "boolq":["BoolQ","acc"], + "cb":["CB","acc"], + "copa":["Copa","acc"], + "rte":["RTE","acc"], + "wic":["WiC","acc"], + "wsc":["WSC","acc"], + "multirc":["MultiRC","acc"], + "record":["ReCoRD","f1"], + "anli_r1":["ANLI R1","acc"], + "anli_r2":["ANLI R2","acc"], + "anli_r3":["ANLI R3","acc"], + "wikitext":["WikiText-2","ppl"], + "logiqa":["LogiQA","acc"], + "mathqa":["MathQA","acc"], + "mc_taco":["MC-TACO","f1"], + "mrpc":["MRPC","acc"], + "prost":["PROST","acc"], + "pubmedqa":["PubMedQA","acc"], + "qnli":["QNLI","acc"], + "qqp":["QQP","acc"], + "sciq":["SciQ","acc"], + "sst":["SST-2","acc"], + "wnli":["WNLI","acc"] +} + +# When comparing with gpt3 paper, the most trustful tasks are the hellaswag to +# 
anli_r3, who have >= 1000 samples (less variation), and have <= 43% data +# contamination in the paper. +gpt3paper_zeroshoteval = { + "hellaswag":[33.7,43.6,51.0,54.7,62.8,67.4,70.9,78.9], + "lambada":[42.7,54.3,60.4,63.6,67.1,70.3,72.5,76.2], + "triviaqa":[4.15,7.61,14.0,19.7,31.3,38.7,41.8,64.3], + "webqs":[1.77,3.20,4.33,4.63,7.92,7.73,8.22,14.4], + "winogrande":[52.0,52.1,57.4,58.7,62.3,64.5,67.9,70.2], + "piqa":[64.6,70.2,72.9,75.1,75.6,78.0,78.5,81.0], + "arc_challenge":[26.6,29.5,31.8,35.5,38.0,41.4,43.7,51.4], + "arc_easy":[43.6,46.5,53.0,53.8,58.2,60.2,63.8,68.8], + "anli_r1":[33.4,34.2,33.4,33.4,34.2,32.3,33.2,34.6], + "anli_r2":[33.2,31.9,33.3,33.3,33.8,33.5,33.5,35.4], + "anli_r3":[33.6,34.0,33.8,33.4,35.3,34.8,34.4,34.5], + "openbookqa":[35.6,43.2,45.2,46.8,53.0,50.4,55.6,57.6], + "race":[35.2,37.9,40.1,40.9,42.4,44.1,44.6,45.5], + "boolq":[49.7,60.3,58.9,62.4,67.1,65.4,66.2,60.5], + "cb":[0.00,32.1,8.93,19.6,19.6,28.6,19.6,46.4], + "copa":[66.0,68.0,73.0,77.0,76.0,80.0,84.0,91.0], + "rte":[47.7,49.8,48.4,56.0,46.6,55.2,62.8,63.5], + "wic":[0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00], + "wsc":[59.6,56.7,65.4,61.5,66.3,60.6,64.4,65.4], + "multirc":[4.72,9.65,12.3,13.6,14.3,18.4,24.2,27.6], + "record":[71.9,79.2,82.8,85.2,87.3,89.5,90.4,91.0] +} + +gpt3paper_fewshoteval = { + "hellaswag":[33.5,43.1,51.3,54.9,62.9,67.3,71.3,79.3], + "lambada":[22.0,40.4,63.2,57.0,78.1,79.1,81.3,86.4], + "triviaqa":[6.96,16.3,26.5,32.1,42.3,51.6,57.5,71.2], + "webqs":[5.46,12.6,15.9,19.6,24.8,27.7,33.5,41.5], + "winogrande":[51.3,52.6,57.5,59.1,62.6,67.4,70.0,77.7], + "piqa":[64.3,69.4,72.0,74.3,75.4,77.8,79.9,82.3], + "arc_challenge":[25.5,28.4,32.3,36.7,39.5,43.7,44.8,51.5], + "arc_easy":[42.7,51.0,58.1,59.1,62.1,65.8,69.1,70.1], + "anli_r1":[32.1,32.5,30.9,32.5,33.5,33.1,33.3,36.8], + "anli_r2":[35.7,33.8,32.1,31.4,32.6,33.3,32.6,34.0], + "anli_r3":[35.0,34.4,35.1,36.0,32.7,33.9,34.5,40.2], + "openbookqa":[37.0,43.6,48.0,50.6,55.6,55.2,60.8,65.4], + "race":[34.3,37.0,40.4,41.4,42.3,44.7,45.1,46.8], + "boolq":[43.1,60.6,62.0,64.1,70.3,70.0,70.2,77.5], + "cb":[42.9,58.9,53.6,69.6,67.9,60.7,66.1,82.1], + "copa":[67.0,64.0,72.0,77.0,83.0,83.0,86.0,92.0], + "rte":[52.3,48.4,46.9,50.9,56.3,49.5,60.6,72.9], + "wic":[49.8,55.0,53.0,53.0,51.6,53.1,51.1,55.3], + "wsc":[58.7,60.6,54.8,49.0,62.5,67.3,75.0,75.0], + "multirc":[6.09,11.8,16.8,20.8,24.7,23.8,25.0,32.5], + "record":[70.7,77.9,82.1,84.0,87.5,88.8,89.8,90.1] +} + +gpt3paper_zeroshoteval_index = { + "125M":0, # Small + "350M":1, # Medium + "760M":2, # Large + "1.3B":3, # XL + "2.7B":4, + "6.7B":5, + "13B":6, + "175B":7 +} + +def round_sig(x, sig=3): + if x == 0: + return 0 + return round(x, sig-int(floor(log10(abs(x))))-1) + +def generate_result_table(tab_header, configs, task_order, caption, avg_range, + avg_tag, avg_only=False, fontsize="\\footnotesize", find_best=False, + candidate_range=None, candidate_task=None, split_name_by_space=False, + print_stderr=False, few_shot=False): + # Gather results + result_list = [] + for i in range(len(configs)): + result_dict = {} + eval_path = configs[i][-1] + if "paper" in configs[i][0]: + assert eval_path is None + if eval_path is None: + assert "paper" in configs[i][0] + assert configs[i][1] in gpt3paper_zeroshoteval_index, "the second element has to be the model size" + paper_result_idx = gpt3paper_zeroshoteval_index[configs[i][1]] + if few_shot: + for task in gpt3paper_fewshoteval: + result_dict[task] = [gpt3paper_fewshoteval[task][paper_result_idx]] + else: + for task in gpt3paper_zeroshoteval: + 
result_dict[task] = [gpt3paper_zeroshoteval[task][paper_result_idx]] + else: + for file in os.listdir(eval_path): + if file.endswith(".json"): + result = json.load(open(eval_path+"/"+file, "r")) + for task in result['results']: + if task != "wikitext": + result_dict[task] = [100.0*result['results'][task][metric_dict[task]]] + else: + result_dict[task] = [result['results'][task][metric_dict[task]]] + result_list.append(result_dict) + avg_list = [] + for i in range(len(configs)): + average_results = [] + for j in range(len(avg_range)): + results = [] + for k in range(avg_range[j]+1): + if task_order[k] in result_list[i]: + results.append(result_list[i][task_order[k]][0]) + if len(results) > 0: + average_results.append(float(sum(results))/len(results)) + else: + average_results.append(0) + avg_list.append(average_results) + + if find_best: + best_avg_value = [0 for _ in range(len(avg_range))] + best_avg_idx = [0 for _ in range(len(avg_range))] + best_task_value = [0 for _ in range(len(candidate_task))] + best_task_idx = [0 for _ in range(len(candidate_task))] + for i in range(candidate_range, len(configs)): + for j in range(len(avg_range)): + if avg_list[i][j] > best_avg_value[j]: + best_avg_value[j] = avg_list[i][j] + best_avg_idx[j] = i + for j in range(len(candidate_task)): + if result_list[i][candidate_task[j]] > best_task_value[j]: + best_task_value[j] = result_list[i][candidate_task[j]] + best_task_idx[j] = i + # reorder configs, result_list, avg_list to only keep the best cases + new_configs = configs[:candidate_range] + new_result_list = result_list[:candidate_range] + new_avg_list = avg_list[:candidate_range] + for i in range(len(avg_range)): + selected_config = copy.deepcopy(configs[best_avg_idx[i]]) + selected_config[0] = "({})Best Avg{}".format(len(new_configs), + avg_tag[i]) + new_configs.append(selected_config) + new_result_list.append(result_list[best_avg_idx[i]]) + new_avg_list.append(avg_list[best_avg_idx[i]]) + + for i in range(len(candidate_task)): + selected_config = copy.deepcopy(configs[best_task_idx[i]]) + selected_config[0] = "({})Best {}".format(len(new_configs), + official_dict[candidate_task[i]][0]) + new_configs.append(selected_config) + new_result_list.append(result_list[best_task_idx[i]]) + new_avg_list.append(avg_list[best_task_idx[i]]) + configs = new_configs + result_list = new_result_list + avg_list = new_avg_list + + # split the case names by space + if split_name_by_space: + max_num_row = 1 + splitted_names = [] + for i in range(len(configs)): + new_name = configs[i][0].split() + max_num_row = max(max_num_row, len(new_name)) + splitted_names.append(new_name) + tab_header = ["" for _ in range(max_num_row-1)] + tab_header + for i in range(len(configs)): + padding = ["" for _ in range(max_num_row-len(splitted_names[i]))] + configs[i] = padding + splitted_names[i] + configs[i][1:] + + # generate the table + print("\\begin{table}") + print("\centering") + print(fontsize) + print("\caption{"+caption+"}") + text = "\\begin{tabular}{@{}l|" + for _ in range(len(configs)): + text += "c" + text += "@{}}" + print(text) + print("\\toprule") + for i in range(len(tab_header)): + text = "{} &".format(tab_header[i]) + for j in range(len(configs)): + if j != len(configs) - 1: + text += (configs[j][i] + "& ") + else: + text += (configs[j][i] + "\\\\") + print(text) + print("\midrule") + for i in range(len(avg_range)): + text = ("Avg. 
" + avg_tag[i]) + arr = [] + for j in range(len(configs)): + arr.append(avg_list[j][i]) + text += " & {}".format(round_sig(avg_list[j][i])) + text += "\\\\" + if print_stderr: + arr_mean = mean(arr) + arr_std = sample_stddev(arr) + text += " % mean {:.3f}, std {:.3f}, mean+1std {:.3f}, mean+2std {:.3f}, mean+3std {:.3f}".format( + arr_mean, arr_std, arr_mean+arr_std, arr_mean+arr_std*2, arr_mean+arr_std*3) + print(text) + if not avg_only: + print("\midrule") + for i in range(len(task_order)): + task = task_order[i] + text = "({}) {}".format(i, official_dict[task][0]) + arr = [] + for j in range(len(configs)): + result_dict = result_list[j] + if task in result_dict: + text += " & {}".format(round_sig(result_dict[task][0])) + arr.append(result_dict[task][0]) + else: + text += " & N/A" + text += "\\\\" + if print_stderr: + arr_mean = mean(arr) + arr_std = sample_stddev(arr) + if task != "wikitext": + text += " % mean {:.3f}, std {:.3f}, mean+1std {:.3f}, mean+2std {:.3f}, mean+3std {:.3f}".format( + arr_mean, arr_std, arr_mean+arr_std, arr_mean+arr_std*2, arr_mean+arr_std*3) + else: + text += " % mean {:.3f}, std {:.3f}, mean-1std {:.3f}, mean-2std {:.3f}, mean-3std {:.3f}".format( + arr_mean, arr_std, arr_mean-arr_std, arr_mean-arr_std*2, arr_mean-arr_std*3) + print(text) + print("\\bottomrule") + print("\end{tabular}") + print("\end{table}") + print("") + print("") + +if __name__ == '__main__': + task_order = ["hellaswag","lambada","triviaqa","webqs","winogrande","piqa", + "arc_challenge","arc_easy","anli_r1","anli_r2","anli_r3","openbookqa", + "race","boolq","copa","rte","wsc","multirc","record","wikitext"] + avg_range = [18] + avg_tag = ["0-18"] + tab_header = ["Case","Model size","Train tokens","Batch size","Bsz warmup","LR","min LR","LR warmup","LR decay","decay style"] + + configs = [ + ["(0)paper","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", None], # gpt3 paper orig results, thus result path is None + ["(1)repro","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/'], + ["(2)fixedBsz","125M","300B","256","N/A","6e-4","6e-5","3000M","260B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'], + ["(3)fixedBsz 300B+minLR","125M","300B","256","N/A","6e-4","1e-6","3000M","300B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results/gpt-pile-0.125B-tok300B-lr6.0e-4-min1.0e-6-wup3000M-dcy300B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'] + ] + caption = 'Conglong: GPT-3 125M results zero-shot' + generate_result_table(tab_header, configs, task_order, caption, avg_range, + avg_tag, split_name_by_space=True, fontsize="\\tiny") + + configs = [ + ["(0)paper","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", None], # gpt3 paper orig results, thus result path is None + ["(1)repro","125M","300B","256","4B","6e-4","6e-5","375M","260B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/'], + ["(2)fixedBsz","125M","300B","256","N/A","6e-4","6e-5","3000M","260B","cosine", + 
'/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'], + ["(3)fixedBsz 300B+minLR","125M","300B","256","N/A","6e-4","1e-6","3000M","300B","cosine", + '/blob/users/conglli/project/data_efficiency_gpt/eval_results_fewshot/gpt-pile-0.125B-tok300B-lr6.0e-4-min1.0e-6-wup3000M-dcy300B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/'], + ] + caption = 'Conglong: GPT-3 125M results few-shot' + generate_result_table(tab_header, configs, task_order, caption, avg_range, + avg_tag, split_name_by_space=True, fontsize="\\tiny", few_shot=True) + diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh new file mode 100644 index 000000000..b14622a32 --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run.sh @@ -0,0 +1,66 @@ +## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md +## and follow the steps of installation/data downloading. +checkpoint_paths=( + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ +) + +## No need to use the exact training config json, just use this dummy is fine +config_path=ds_config_eval_dummy.json +username=$(whoami) +result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results" + +## Task(s) on the same row will be performed together in the same process. +## There exist other tasks that can run but we skip because they didn't appear +## or have strange scores in GPT-3 paper: qqp, prost, cb, wic, mrpc, sst, wnli +## pubmedqa, logiqa, qnli, sciq, mc_taco, mathqa. For wikitext, it didn't +## appear in paper but we include it for a perplexity task. +tasks=( + record + triviaqa + hellaswag + arc_challenge + arc_easy + race + multirc + openbookqa + lambada + webqs + winogrande + piqa + anli_r1,anli_r2,anli_r3 + boolq,copa + rte,wsc + wikitext +) + +## Use localhost if you didn't setup hostfile as described in +## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. +## If hostfile exist, use hostname (e.g., worker-0) in hostfile. +# hostname="localhost" +hostname="worker-0" + +batch_size=32 + +## This script is for zero-shot +num_fewshot=0 + +num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +cuda_id=-1 +total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) + +## Code below only works when you run each evalharness task on a single GPU. 
+## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh +for l in "${!checkpoint_paths[@]}"; do + checkpoint_path=${checkpoint_paths[l]} + for ((i=0;i<${#tasks[@]};++i)); do + task=${tasks[i]} + free_mem=0 + while [ $free_mem -lt $total_mem ]; do + cuda_id=$(((cuda_id+1)%num_gpus)) + free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) + sleep 60s + done + bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & + done +done diff --git a/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh new file mode 100644 index 000000000..208de033f --- /dev/null +++ b/examples/data_efficiency/gpt/eval/ds_evalharness_parallel_run_10shot.sh @@ -0,0 +1,61 @@ +## CAUTION: first read Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md +## and follow the steps of installation/data downloading. +checkpoint_paths=( + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup375M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234-bwup4B/global_step591581/ + /vc_data_blob/users/conglli/project/data_efficient_gpt/checkpoint/gpt-pile-0.125B-tok300B-lr6.0e-4-min6.0e-5-wup3000M-dcy260B-sty-cosine-gbs256-mbs4-gpu64-zero0-mp1-pp1-nopp-seed1234/global_step572205/ +) + +## No need to use the exact training config json, just use this dummy is fine +config_path=ds_config_eval_dummy.json +username=$(whoami) +result_path="/blob/users/${username}/project/data_efficient_gpt/eval_results_10shot" + +## Task(s) on the same row will be performed together in the same process. +tasks=( + record + triviaqa + hellaswag + arc_challenge + arc_easy + race + multirc + openbookqa + lambada + webqs + winogrande + piqa + anli_r1,anli_r2 + anli_r3 + boolq,copa + rte,wsc +) + +num_fewshot=10 + +## Use localhost if you didn't setup hostfile as described in +## https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node. +## If hostfile exist, use hostname (e.g., worker-0) in hostfile. +# hostname="localhost" +hostname="worker-0" + +batch_size=16 + +num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +cuda_id=-1 +total_mem=$(nvidia-smi --query-gpu=memory.total --format=csv -i 0 | grep -Eo [0-9]+) + +## Code below only works when you run each evalharness task on a single GPU. 
+## For multi-GPU evalharness, check Megatron-DeepSpeed/blob/main/examples/MoE/ds_evalharness.sh +for l in "${!checkpoint_paths[@]}"; do + checkpoint_path=${checkpoint_paths[l]} + for ((i=0;i<${#tasks[@]};++i)); do + task=${tasks[i]} + free_mem=0 + while [ $free_mem -lt $total_mem ]; do + cuda_id=$(((cuda_id+1)%num_gpus)) + free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $cuda_id | grep -Eo [0-9]+) + sleep 60s + done + bash ds_evalharness_1gpu.sh $checkpoint_path $config_path $result_path $cuda_id $task $hostname $batch_size $num_fewshot & + done +done diff --git a/examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json new file mode 100644 index 000000000..a9e3d6116 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_1clmetric_TEMPLATE.json @@ -0,0 +1,74 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "decoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json new file mode 100644 index 000000000..3209f34b0 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_config_gpt_2clmetrics_TEMPLATE.json @@ -0,0 +1,88 @@ +{ + "train_batch_size": GBSIZE, + "train_micro_batch_size_per_gpu": MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false, + "dataloader_drop_last": true, + "data_efficiency": { + "enabled": true, + "seed": 
DATA_EFFICIENCY_SEED, + "data_routing": { + "enabled": LTD_ENABLED, + "random_ltd":{ + "enabled": LTD_ENABLED, + "total_layer_num": 24, + "random_ltd_layer_num": 22, + "random_ltd_layer_id": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22], + "model_mask_name": "attention_mask", + "model_type": "decoder", + "hidden_state_order": "seq_batch_dim", + "random_ltd_schedule": { + "min_value": LTD_MIN, + "max_value": LTD_MAX, + "schedule_type":"fixed_linear", + "schedule_config": { + "require_steps": LTD_STEP, + "seq_per_step": 16 + } + } + } + }, + "data_sampling": { + "enabled": CL_ENABLED, + "num_workers": DATA_SAMPLING_NUM_WORKERS, + "curriculum_learning": { + "enabled": CL_ENABLED, + "data_cluster_path": "CL_CLUSTER_PATH", + "curriculum_metrics": { + "CL_1st_METRIC_NAME": { + "index_to_sample_path": "CL_1st_SAMPLE_PATH", + "index_to_metric_path": "CL_1st_METRIC_PATH", + "difficulty_type": "CL_1st_DIFF_TYPE", + "clustering_type": "CL_1st_CLUSTER_TYPE", + "min_difficulty": CL_1st_MIN, + "max_difficulty": CL_1st_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_1st_TOTAL_STEP, + "difficulty_step": CL_1st_DIFF_STEP, + "root_degree": CL_1st_ROOT + } + }, + "CL_2nd_METRIC_NAME": { + "index_to_sample_path": "CL_2nd_SAMPLE_PATH", + "index_to_metric_path": "CL_2nd_METRIC_PATH", + "difficulty_type": "CL_2nd_DIFF_TYPE", + "clustering_type": "CL_2nd_CLUSTER_TYPE", + "min_difficulty": CL_2nd_MIN, + "max_difficulty": CL_2nd_MAX, + "schedule_type": "fixed_root", + "schedule_config": { + "total_curriculum_step": CL_2nd_TOTAL_STEP, + "difficulty_step": CL_2nd_DIFF_STEP, + "root_degree": CL_2nd_ROOT + } + } + } + } + } + } +} diff --git a/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh new file mode 100644 index 000000000..334989487 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_base_script.sh @@ -0,0 +1,515 @@ +#!/bin/bash +dir=`pwd` +############################################################################### +### Main configs +## GPT-3 models use 2K sequence length/context window +seq_len=2048 + +## The "GPT-3 XXX" below are configs from GPT-3 paper +## https://arxiv.org/abs/2005.14165, choose based on +## your desired model size or build your own configs + +## init_std is standard deviation for weight initialization. Usually larger +## model needs lower std. We used a heuristic equation of sqrt(1/3/hidden_size) +## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf) + +## We changed min_lr to a lower number (1.0e-6), which we found is able to +## provide better zero-shot eval results. 
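+## As a worked example of the init_std heuristic above: hidden_size=768 gives
+## sqrt(1/3/768) ~= 0.021 (rounded to the 0.02 used below), and hidden_size=2048
+## gives sqrt(1/3/2048) ~= 0.013, the init_std used for the 1.3B model below.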
+ +## GPT-3 Small 125M +# model_size=0.125 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 +# global_batch_size=256 +# lr=6.0e-4 +# min_lr=1.0e-6 +# init_std=0.02 + +## GPT-3 Medium 350M +# model_size=0.35 +# num_layers=24 +# hidden_size=1024 +# num_attn_heads=16 +# global_batch_size=256 +# lr=3.0e-4 +# min_lr=1.0e-6 +# init_std=0.018 + +## GPT-3 Large 760M +# model_size=0.76 +# num_layers=24 +# hidden_size=1536 +# num_attn_heads=16 +# global_batch_size=256 +# lr=2.5e-4 +# min_lr=1.0e-6 +# init_std=0.015 + +## GPT-3 XL 1.3B +model_size=1.3 +num_layers=24 +hidden_size=2048 +num_attn_heads=16 +global_batch_size=512 +# lr=2.0e-4 +lr=$1 +min_lr=1.0e-6 +init_std=0.013 + +## GPT-3 2.7B +# model_size=2.7 +# num_layers=32 +# hidden_size=2560 +# num_attn_heads=32 +# global_batch_size=512 +# lr=1.6e-4 +# min_lr=1.0e-6 +# init_std=0.011 + +## GPT-3 6.7B +# model_size=6.7 +# num_layers=32 +# hidden_size=4096 +# num_attn_heads=32 +# global_batch_size=1024 +# lr=1.2e-4 +# min_lr=1.0e-6 +# init_std=0.009 + +## GPT-3 13B +# model_size=13 +# num_layers=40 +# hidden_size=5120 +# num_attn_heads=40 +# global_batch_size=1024 +# lr=1.0e-4 +# min_lr=1.0e-6 +# init_std=0.008 + +## GPT-3 175B +# model_size=175 +# num_layers=96 +# hidden_size=12288 +# num_attn_heads=96 +# global_batch_size=1536 +# lr=0.6e-4 +# min_lr=1.0e-6 +# init_std=0.005 +############################################################################### +### Training duration configs +## The main termination condition, original GPT-3 paper trains for 300B tokens. +# train_tokens_in_billion=300 +train_tokens_in_billion=$2 +train_tokens=$((${train_tokens_in_billion} * 1000000000)) + +## train_samples is another termination condition and also affect the number of +## data samples to be indexed. Since we want to reach the train_tokens +## above, and data efficiency techniques may change num tokens in some samples, +## so we just set this config large enough to make sure we have enough +## processed data and don't terminate by train_samples. +train_samples=$(( 300 * 1000000000 * 2 / ${seq_len} )) + +## Another wall-clock time termination condition in minutes. Set it large +## enough to avoid undesired early termination. +exit_duration=30000000 +############################################################################### +### lr configs +## lr warmup and decay duration. +## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens. +## Here we increase the warmup tokens to 3B since when batch size warmup is not +## used, there are more tokens per step. Thus we need to increase warmup tokens +## to make sure there are enough warmup steps, which is important for training +## stability. +lr_warmup_tokens_in_million=3000 +lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000)) +## Here we changed the LR decay tokens to align with total train tokens, since +## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the +## learning rate schedule to match the number of training tokens results in the +## best final model quality +lr_decay_tokens_in_billion=${train_tokens_in_billion} +lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000)) +lr_decay_style="cosine" +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Note that currently both curriculum learning and random-LTD are NOT +## compatible with pipeline parallelism. 
+pp_size=1 +no_pp="true" + +## ZeRO-based data parallelism, stage=0 will disable ZeRO +zero_stage=1 + +## Total number of GPUs. ds_ssh is from DeepSpeed library. +num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2)) +num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) +num_node=$(( ${num_gpus} / ${num_gpus_pernode} )) + +## Data parallel size. +dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} )) + +## Micro batch size per GPU +## Make sure that batch_size <= global_batch_size*pp_size*mp_size/num_gpus +## Reduce it manually if GPU OOM +batch_size=$(( ${global_batch_size} / ${dp_size} )) +############################################################################### +### Random layerwise token dropping (random-LTD) configs +## random-LTD's main switch. "false" means disabled. "true" means enabled. +ltd_enabled=${3:-'false'} +## How much dropping ratio to start with. The value denotes the seqlen after +## dropping. +ltd_start=${4:-2048} +## How many steps for random-LTD to gradually reduce dropping ratio to zero. +ltd_step=${5:-1} + +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=200000 +############################################################################### +### Curriculum learning (CL) configs +## CL's main switch. "false" means disabled. "true" means enabled. +cl_enabled=${6:-'false'} +## Number of CL metrics to use. +cl_num_metric=${7:-1} + +## Name of difficulty metric +cl_1st_metric=${8:-'dummy'} +## Path to the data indexes for this difficulty metric. Samples on ith row of +## index_to_sample have the difficulty value equals to ith row of +## index_to_metric. +cl_1st_index_to_sample_path=${9:-'dummy'} +cl_1st_index_to_metric_path=${10:-'dummy'} +## During training, whether increase difficulty by value- or percentile-based. +cl_1st_difficulty_type=${11:-'value'} +## "single_cluster" means no clustering required and probably CL is achieved by +## data postprocessing. "schedule_based" means will cluster data based on the +## difficulty schedule (pacing function) below. +cl_1st_clustering_type=${12:-'single_cluster'} +## Start difficulty +cl_1st_min=${13:-2048} +## End difficulty +cl_1st_max=${14:-2048} +## Total step to reach end difficulty +cl_1st_total_step=${15:-1} +## When changing difficulty, always make sure it's a multiple of the +## difficulty_step below. +cl_1st_difficulty_step=${16:-1} +## Root degree of the schedule (pacing function). +cl_1st_root=${17:-1} + +cl_2nd_metric=${18:-'dummy'} +cl_2nd_index_to_sample_path=${19:-'dummy'} +cl_2nd_index_to_metric_path=${20:-'dummy'} +cl_2nd_difficulty_type=${21:-'value'} +cl_2nd_clustering_type=${22:-'single_cluster'} +cl_2nd_min=${23:-2048} +cl_2nd_max=${24:-2048} +cl_2nd_total_step=${25:-1} +cl_2nd_difficulty_step=${26:-1} +cl_2nd_root=${27:-1} + +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# ## The *_index_to_sample_percentile_merged is a concatenated index for perf +# ## improvement, but it only works when you set difficulty_type="percentile" in +# ## ds_config. 
If you use difficulty_type="value", you need to change this to +# ## *_index_to_sample +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 + +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +############################################################################### +### Misc configs +log_interval=100 +eval_iters=10 +eval_interval=100 +# num_save controls how frequent to save checkpoint. num_save=20 means that a +# checkpoint will be saved every 5% of training. For longer training you would +# want larger num_save to save more frequently, and vice versa. +num_save=100 +estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size})) +save_interval=$((${estimated_train_iter} / ${num_save})) + +## Activation checkpointing saves GPU memory, but reduces training speed +activation_checkpoint="true" +# activation_checkpoint="false" + +## Whether or not log optimizer states (norms, max abs values) to tensorboard. +## This is not required for training and might save GPU memory when turned off. +log_optimizer_state="true" +############################################################################### +### Output and data configs +current_time=$(date "+%Y.%m.%d_%H.%M.%S") +host="${HOSTNAME}" +seed=1234 +num_workers=0 + +## Public the Pile dataset, can be downloaded at +## https://mystic.the-eye.eu/public/AI/pile_neox/ Change data_home to where you +## store the pile_text_document.bin and pile_text_document.idx. +data_home="/vc_data_blob/users/conglli/the_pile_public_merged_nopreprocessing" +if [[ "$host" == *"webxt"* ]]; then + data_home="/blob/data/the_pile_public_merged_nopreprocessing" +fi +data_path="${data_home}/pile_text_document" +## *_idx_path force Megatron to use a specific data index file generated when +## we analyze data. This is needed because our index for curriculum learning +## difficulty metric is based on this data index. +doc_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_doc_idx.npy" +sample_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_sample_idx.npy" +shuffle_idx_path="${data_home}/pile_text_document_train_indexmap_exact1ep_2048sl_1234s_shuffle_idx.npy" + +vocab_path="gpt2-vocab.json" +if [ ! -f "$vocab_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json +fi +merge_path="gpt2-merges.txt" +if [ ! 
-f "$merge_path" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt +fi + +prescale_grad="true" +jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B" +jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}" +jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}" +if [[ $zero_stage -gt 0 ]]; then + jobname="${jobname}_z${zero_stage}" + prescale_grad="false" +fi +if [[ $mp_size -gt 1 ]]; then + jobname="${jobname}_mp${mp_size}" +fi +if [ "${no_pp}" = "false" ]; then + jobname="${jobname}_pp${pp_size}" +fi +jobname="${jobname}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + jobname="${jobname}_ltd_${ltd_start}_${ltd_step}" +fi +if [ "${cl_enabled}" = "true" ]; then + jobname="${jobname}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + jobname="${jobname}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}" + fi +fi + +username=$(whoami) +output_home="/blob/users/${username}/project/data_efficient_gpt" +log_path="${output_home}/log/" +checkpoint_path="${output_home}/checkpoint/${jobname}" +## Microsoft internal constraint: because tensorboard is logged by last rank, +## it's better to put the path in NFS instead of Blob. +tensorboard_dir="/vc_data/users/${username}/project/data_efficient_gpt/tensorboard/" +tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}" +mkdir -p ${log_path} +mkdir -p ${checkpoint_path} +mkdir -p ${tensorboard_path} +if [ "${cl_enabled}" = "true" ]; then + data_cluster_path="${output_home}/data_cluster/${jobname}" + mkdir -p ${data_cluster_path} +fi +############################################################################### +data_options=" \ + --vocab-file ${vocab_path} \ + --merge-file ${merge_path} \ + --data-path ${data_path} \ + --data-impl mmap" + +## If CL is used, make sure to set "--split" the same as what you used during +## offline data analysis&indexing. 
+megatron_options=" \ + --override-lr-scheduler \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tensor-model-parallel-size ${mp_size} \ + --init-method-std ${init_std} \ + --lr-decay-tokens ${lr_decay_tokens} \ + --lr-warmup-tokens ${lr_warmup_tokens} \ + --micro-batch-size ${batch_size} \ + --exit-duration-in-mins ${exit_duration} \ + --global-batch-size ${global_batch_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --train-tokens ${train_tokens} \ + --train-samples ${train_samples} \ + --lr ${lr} \ + --min-lr ${min_lr} \ + --lr-decay-style ${lr_decay_style} \ + --split 949,50,1 \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --save-interval ${save_interval} \ + --weight-decay 0.1 \ + --clip-grad 1.0 \ + --hysteresis 2 \ + --num-workers ${num_workers} \ + --fp16 \ + --seed ${seed} \ + --load ${checkpoint_path} \ + --save ${checkpoint_path} \ + --tensorboard-queue-size 1 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --tensorboard-dir ${tensorboard_path}" + +if [ "${activation_checkpoint}" = "true" ]; then +megatron_options="${megatron_options} \ + --checkpoint-activations" +fi + +if [ "${log_optimizer_state}" = "true" ]; then +megatron_options="${megatron_options} \ + --log-optimizer-states-to-tensorboard" +fi + +if [ "${ltd_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --random-ltd" +fi + +if [ "${cl_enabled}" = "true" ]; then +megatron_options="${megatron_options} \ + --train-doc-idx-path ${doc_idx_path} \ + --train-sample-idx-path ${sample_idx_path} \ + --train-shuffle-idx-path ${shuffle_idx_path} \ + --data-efficiency-curriculum-learning" +fi + +config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}_seed${seed}" +if [ "${ltd_enabled}" = "true" ]; then + config_json="${config_json}_ltd_${ltd_start}_${ltd_step}" +fi +if [ "${cl_enabled}" = "true" ]; then + config_json="${config_json}_cl_${cl_1st_metric}_${cl_1st_min}_${cl_1st_max}_${cl_1st_total_step}_${cl_1st_root}" + if [[ $cl_num_metric -gt 1 ]]; then + config_json="${config_json}_${cl_2nd_metric}_${cl_2nd_min}_${cl_2nd_max}_${cl_2nd_total_step}_${cl_2nd_root}" + fi +fi +config_json="${config_json}.json" +if [[ $cl_num_metric -gt 1 ]]; then +template_json="ds_config_gpt_2clmetrics_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed 
"s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed "s/CL_1st_ROOT/${cl_1st_root}/" \ + | sed "s#CL_2nd_METRIC_NAME#${cl_2nd_metric}#" \ + | sed "s#CL_2nd_SAMPLE_PATH#${cl_2nd_index_to_sample_path}#" \ + | sed "s#CL_2nd_METRIC_PATH#${cl_2nd_index_to_metric_path}#" \ + | sed "s#CL_2nd_DIFF_TYPE#${cl_2nd_difficulty_type}#" \ + | sed "s#CL_2nd_CLUSTER_TYPE#${cl_2nd_clustering_type}#" \ + | sed "s/CL_2nd_MIN/${cl_2nd_min}/" \ + | sed "s/CL_2nd_MAX/${cl_2nd_max}/" \ + | sed "s/CL_2nd_TOTAL_STEP/${cl_2nd_total_step}/" \ + | sed "s/CL_2nd_DIFF_STEP/${cl_2nd_difficulty_step}/" \ + | sed "s/CL_2nd_ROOT/${cl_2nd_root}/" \ + > ${config_json} +else +template_json="ds_config_gpt_1clmetric_TEMPLATE.json" +sed "s/GBSIZE/${global_batch_size}/" ${template_json} \ + | sed "s/MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/${prescale_grad}/" \ + | sed "s/DATA_EFFICIENCY_SEED/${seed}/" \ + | sed "s/LTD_ENABLED/${ltd_enabled}/" \ + | sed "s/LTD_MIN/${ltd_start}/" \ + | sed "s/LTD_MAX/${seq_len}/" \ + | sed "s/LTD_STEP/${ltd_step}/" \ + | sed "s/CL_ENABLED/${cl_enabled}/" \ + | sed "s/DATA_SAMPLING_NUM_WORKERS/${num_workers}/" \ + | sed "s#CL_CLUSTER_PATH#${data_cluster_path}#" \ + | sed "s#CL_1st_METRIC_NAME#${cl_1st_metric}#" \ + | sed "s#CL_1st_SAMPLE_PATH#${cl_1st_index_to_sample_path}#" \ + | sed "s#CL_1st_METRIC_PATH#${cl_1st_index_to_metric_path}#" \ + | sed "s#CL_1st_DIFF_TYPE#${cl_1st_difficulty_type}#" \ + | sed "s#CL_1st_CLUSTER_TYPE#${cl_1st_clustering_type}#" \ + | sed "s/CL_1st_MIN/${cl_1st_min}/" \ + | sed "s/CL_1st_MAX/${cl_1st_max}/" \ + | sed "s/CL_1st_TOTAL_STEP/${cl_1st_total_step}/" \ + | sed "s/CL_1st_DIFF_STEP/${cl_1st_difficulty_step}/" \ + | sed "s/CL_1st_ROOT/${cl_1st_root}/" \ + > ${config_json} +fi + +deepspeed_options=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --pipeline-model-parallel-size ${pp_size}" + +if [[ "${no_pp}" = "true" ]]; then +deepspeed_options="${deepspeed_options} \ + --no-pipeline-parallel" +fi + +if [ "${activation_checkpoint}" = "true" ]; then +deepspeed_options="${deepspeed_options} \ + --deepspeed-activation-checkpointing" +fi + +## When saving checkpoint to a storage with cache, their could be consistency +## issue of the pointer to latest checkpoint. Here we find the correct pointer +## and broadcast it to all nodes. +iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt" +iteration_file_2="$checkpoint_path/latest" +iteration=0 +for (( node = 0; node <= num_node-1; node++ )) +do + if $(ssh -q worker-"$node" "test -f \"$iteration_file\""); then + local_iteration=$(ssh -q worker-"$node" cat $iteration_file) + iteration=$(( ${local_iteration} > ${iteration} ? 
${local_iteration} : ${iteration} )) + fi +done +if [[ $iteration -gt 0 ]]; then + iteration_2="global_step${iteration}" + ds_ssh "echo $iteration > $iteration_file" + ds_ssh "echo $iteration_2 > $iteration_file_2" +fi + +deepspeed ${dir}/../../../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log \ No newline at end of file diff --git a/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh new file mode 100644 index 000000000..8878c1792 --- /dev/null +++ b/examples/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh @@ -0,0 +1,366 @@ +############################################################################### +### Each block below is one pretraining setup. Uncomment one block to try. +############################################################################### +### Baseline cases, mostly based on OpenAI's GPT-3 hyperparameters, but with +### some changes (without batch size warmup, and different LR schedule). +## Baseline 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} +############################################################################### +## Baseline 200B tokens (67%): +# lr=3.0e-4 # scaled based on train token reduction ratio +# train_tokens_in_billion=200 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} +############################################################################### +## Baseline 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} +############################################################################### +### Curriculum learning (CL) + Random layerwise token dropping (random-LTD). +### DeepSpeed Data Efficiency's best composed solution. 
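+## The blocks below pass positional arguments to
+## ds_pretrain_gpt_1.3B_dense_base_script.sh in the order defined at the top of
+## that script: lr ($1), train_tokens_in_billion ($2), the random-LTD settings
+## ($3-$5), then the curriculum learning settings ($6-$27).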
+## CL+random-LTD 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=200000 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL+random-LTD 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=100000 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=55000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=55000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +### Random layerwise token dropping (random-LTD). 
+## random-LTD 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=200000 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} +############################################################################### +## random-LTD 200B tokens (67%): +# lr=3.0e-4 +# train_tokens_in_billion=200 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=133333 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} +############################################################################### +## random-LTD 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step=100000 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} +############################################################################### +### Curriculum learning (CL). +## CL vocab rarity + seqlen truncation 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen truncation 200B tokens (67%): +# lr=3.0e-4 +# train_tokens_in_billion=200 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=73000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# 
cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=73000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen truncation 150B tokens (50%): +# lr=4.0e-4 +# train_tokens_in_billion=150 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=55000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# cl_2nd_total_step=55000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity + seqlen reshape 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_reshape" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=80 +# cl_2nd_max=2048 +# 
cl_2nd_total_step=110000 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL vocab rarity 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_gpt_1epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=1 +# cl_1st_max=100 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen truncation 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen_truncate" +# cl_1st_index_to_sample_path="dummy" +# cl_1st_index_to_metric_path="dummy" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=80 +# cl_1st_max=2048 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=8 +# cl_1st_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### +## CL seqlen reshape 300B tokens (100%): +# lr=2.0e-4 +# train_tokens_in_billion=300 +# ltd_enabled="false" +# ltd_start=2048 +# ltd_step=1 +# cl_enabled="true" +# cl_num_metric=1 +# cl_1st_metric="seqlen_reshape" +# cl_1st_index_to_sample_path="dummy" +# cl_1st_index_to_metric_path="dummy" +# cl_1st_difficulty_type="value" +# cl_1st_clustering_type="single_cluster" +# cl_1st_min=80 +# cl_1st_max=2048 +# cl_1st_total_step=110000 +# cl_1st_difficulty_step=8 +# cl_1st_root=1 +# bash ds_pretrain_gpt_1.3B_dense_base_script.sh ${lr} \ +# ${train_tokens_in_billion} ${ltd_enabled} ${ltd_start} ${ltd_step} \ +# ${cl_enabled} ${cl_num_metric} 
${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} +############################################################################### \ No newline at end of file diff --git a/megatron/arguments.py b/megatron/arguments.py index 1d82427a3..4c4e00dcd 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -248,7 +248,7 @@ def parse_args(extra_args_provider=None, defaults={}, 'for distribute-checkpointed-activations to work you '\ 'need to enable checkpoint-activations' - args.curriculum_learning = False + args.curriculum_learning_legacy = False args.compression_training = False # AML @@ -444,6 +444,9 @@ def _add_training_args(parser): group.add_argument('--train-tokens', type=int, default=None, help='Total number of tokens to train over all ' 'training runs.') + group.add_argument('--random-ltd', + action='store_true', + help='enable random layer token drop') group.add_argument('--log-interval', type=int, default=100, help='Report loss and timing interval.') group.add_argument('--exit-interval', type=int, default=None, @@ -748,7 +751,21 @@ def _add_data_args(parser): 'end-of-document token.') group.add_argument('--eod-mask-loss', action='store_true', help='Mask loss for the end of document tokens.') - + group.add_argument('--train-data-exact-num-epochs', type=int, default=None, + help='When building the train dataset, force it to be ' + 'an exact number of epochs of the raw data') + group.add_argument('--return-data-index', action='store_true', + help='Return the index of data sample.') + group.add_argument('--data-efficiency-curriculum-learning', action='store_true', + help='Use DeepSpeed data efficiency library curriculum learning feature.') + group.add_argument('--train-idx-path', type=str, default=None, + help='Force to use certain index file.') + group.add_argument('--train-doc-idx-path', type=str, default=None, + help='Force to use certain index file.') + group.add_argument('--train-sample-idx-path', type=str, default=None, + help='Force to use certain index file.') + group.add_argument('--train-shuffle-idx-path', type=str, default=None, + help='Force to use certain index file.') return parser diff --git a/megatron/data/bert_dataset.py b/megatron/data/bert_dataset.py index 916a3be06..24965f7ca 100644 --- a/megatron/data/bert_dataset.py +++ b/megatron/data/bert_dataset.py @@ -73,13 +73,14 @@ def __len__(self): return self.samples_mapping.shape[0] def __getitem__(self, idx): + args = get_args() start_idx, end_idx, seq_length = self.samples_mapping[idx] sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
# We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) - return build_training_sample(sample, seq_length, + train_sample = build_training_sample(sample, seq_length, self.max_seq_length, # needed for padding self.vocab_id_list, self.vocab_id_to_token_dict, @@ -87,6 +88,9 @@ def __getitem__(self, idx): self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.binary_head) + if args.return_data_index: + train_sample['index'] = np.array([idx], dtype=np.int64) + return train_sample diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3052e9fdd..cf8ccb9fd 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -224,7 +224,7 @@ def create_masked_lm_predictions(tokens, if masked_lm_prob == 0: return (output_tokens, masked_lm_positions, - masked_lm_labels, token_boundary) + masked_lm_labels, token_boundary, None) num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) @@ -640,27 +640,39 @@ def get_samples_mapping(indexed_dataset, name, binary_head): """Get a list that maps a sample index to a starting sentence index, end sentence index, and length""" - - if not num_epochs: - if not max_num_samples: - raise ValueError("Need to specify either max_num_samples " - "or num_epochs") - num_epochs = np.iinfo(np.int32).max - 1 - if not max_num_samples: + args = get_args() + if args.train_data_exact_num_epochs is not None and name == 'train': + num_epochs = args.train_data_exact_num_epochs max_num_samples = np.iinfo(np.int64).max - 1 + else: + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 # Filename of the index mapping indexmap_filename = data_prefix indexmap_filename += '_{}_indexmap'.format(name) - if num_epochs != (np.iinfo(np.int32).max - 1): - indexmap_filename += '_{}ep'.format(num_epochs) - if max_num_samples != (np.iinfo(np.int64).max - 1): - indexmap_filename += '_{}mns'.format(max_num_samples) + if args.train_data_exact_num_epochs is not None and name == 'train': + indexmap_filename += '_exact{}ep'.format(num_epochs) + else: + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) indexmap_filename += '_{}msl'.format(max_seq_length) indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) indexmap_filename += '_{}s'.format(seed) indexmap_filename += '.npy' + if name == 'train': + # force to use certain index files + if args.train_idx_path is not None: + indexmap_filename = args.train_idx_path + # Build the indexed mapping if not exist. 
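# Illustrative sketch only (made-up prefix, seed and sequence values): how the
# index-map cache filename is composed under the new options added here.
import numpy as np

def sketch_indexmap_filename(data_prefix, name, num_epochs, max_num_samples,
                             max_seq_length, short_seq_prob, seed,
                             exact_num_epochs=None, forced_idx_path=None):
    filename = '{}_{}_indexmap'.format(data_prefix, name)
    if exact_num_epochs is not None and name == 'train':
        filename += '_exact{}ep'.format(exact_num_epochs)
    else:
        if num_epochs != (np.iinfo(np.int32).max - 1):
            filename += '_{}ep'.format(num_epochs)
        if max_num_samples != (np.iinfo(np.int64).max - 1):
            filename += '_{}mns'.format(max_num_samples)
    filename += '_{}msl_{:0.2f}ssp_{}s.npy'.format(
        max_seq_length, short_seq_prob, seed)
    # --train-idx-path (if given) overrides the derived name for the train split.
    if name == 'train' and forced_idx_path is not None:
        filename = forced_idx_path
    return filename

# e.g. sketch_indexmap_filename('pile_bert', 'train', 3, 10**8, 512, 0.1, 1234,
#                               exact_num_epochs=1)
# -> 'pile_bert_train_indexmap_exact1ep_512msl_0.10ssp_1234s.npy'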
if torch.distributed.get_rank() == 0 and \ not os.path.isfile(indexmap_filename): @@ -699,12 +711,13 @@ def get_samples_mapping(indexed_dataset, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + if torch.cuda.device_count() > 0: # Skip when CPU-only + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) # Load indexed dataset. print_rank_0(' > loading indexed mapping from {}'.format( diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0c9058454..5625f4586 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,7 +21,7 @@ import numpy as np import torch -from megatron import mpu, is_rank_0, print_rank_0 +from megatron import mpu, is_rank_0, print_rank_0, get_args from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_ @@ -159,6 +159,8 @@ def __len__(self): return self.sample_idx.shape[0] - 1 def __getitem__(self, idx): + args = get_args() + orig_idx = idx # Get the shuffled index. idx = self.shuffle_idx[idx] # Start and end documents and offsets. @@ -183,7 +185,8 @@ def __getitem__(self, idx): self.doc_idx[doc_index_l], length=offset_l + 1)) sample = np.concatenate(sample_list) - + if args.return_data_index: + return {'text': np.array(sample, dtype=np.int64), 'index': np.array([orig_idx], dtype=np.int64)} return {'text': np.array(sample, dtype=np.int64)} @@ -195,6 +198,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, training sample. shuffle-idx: maps the sample index into a random index into sample-idx. """ + args = get_args() # Number of tokens in each epoch and number of required epochs. tokens_per_epoch = _num_tokens(documents, sizes) num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) @@ -204,13 +208,26 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # Filename of the index mappings. 
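# Illustrative sketch only (made-up values): with the new --return-data-index
# flag each GPT sample carries its original (pre-shuffle) index next to the
# token ids, so a sample can be traced back to the data-analysis index files.
import numpy as np
sample_tokens = np.arange(2049, dtype=np.int64)  # stand-in for one training sample
orig_idx = 7                                     # index before shuffling
with_index = {'text': sample_tokens,
              'index': np.array([orig_idx], dtype=np.int64)}
without_index = {'text': sample_tokens}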
_filename = data_prefix _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) + if args.train_data_exact_num_epochs is not None and name == 'train': + num_epochs = args.train_data_exact_num_epochs + _filename += '_exact{}ep'.format(num_epochs) + else: + _filename += '_{}ns'.format(num_samples) _filename += '_{}sl'.format(seq_length) _filename += '_{}s'.format(seed) doc_idx_filename = _filename + '_doc_idx.npy' sample_idx_filename = _filename + '_sample_idx.npy' shuffle_idx_filename = _filename + '_shuffle_idx.npy' + if name == 'train': + # force to use certain index files + if args.train_doc_idx_path is not None: + doc_idx_filename = args.train_doc_idx_path + if args.train_sample_idx_path is not None: + sample_idx_filename = args.train_sample_idx_path + if args.train_shuffle_idx_path is not None: + shuffle_idx_filename = args.train_shuffle_idx_path + # Build the indexed mapping if not exist. if is_rank_0(): if (not os.path.isfile(doc_idx_filename)) or \ diff --git a/megatron/initialize.py b/megatron/initialize.py index 014bfa073..5ed7dc089 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -134,7 +134,8 @@ def _compile_dependencies(): if _is_rank_0(): start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) - fused_kernels.load(args) + if torch.cuda.device_count() > 0: # Skip when CPU-only + fused_kernels.load(args) torch.distributed.barrier() else: torch.distributed.barrier() @@ -205,7 +206,7 @@ def _initialize_distributed(): else: args.local_rank = device - torch.cuda.set_device(device) + torch.cuda.set_device(device) # only do so when device_count > 0 # Call the init process init_method = 'tcp://' @@ -247,7 +248,11 @@ def _set_random_seed(seed_): """Set random seed for reproducability.""" if seed_ is not None and seed_ > 0: # Ensure that different pipeline MP stages get different seeds. - seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) + # No need to do so for CPU-only case. 
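# Illustrative sketch only (made-up stage count): on GPU runs each pipeline
# stage derives its own seed from the base seed, while a CPU-only run keeps
# the base seed unchanged.
base_seed = 1234
num_pipeline_stages = 4
gpu_stage_seeds = [base_seed + 100 * stage for stage in range(num_pipeline_stages)]
# -> [1234, 1334, 1434, 1534]; CPU-only: every rank just uses 1234.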
+ if torch.cuda.device_count() == 0: + seed = seed_ + else: + seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index f31d1a12c..62b951064 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -114,7 +114,7 @@ def forward(self, input_ids, position_ids, attention_mask, labels=None, # attention_mask has size [1, 1, seqlen, seqlen] attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous() else: - if args.curriculum_learning: + if args.curriculum_learning_legacy: # If got a None input, need to reset curriculum_seqlen on user side args.curriculum_seqlen = args.seq_length diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py index 2ca337c8e..2c6802a74 100644 --- a/megatron/model/language_model.py +++ b/megatron/model/language_model.py @@ -427,7 +427,7 @@ def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, def state_dict_for_save_checkpoint(self, destination=None, prefix='', keep_vars=False): """For easy load.""" - + args = get_args() state_dict_ = {} moe_state_dict = {} if self.pre_process: @@ -436,6 +436,15 @@ def state_dict_for_save_checkpoint(self, destination=None, prefix='', destination, prefix, keep_vars) encoder_state_dict = self.encoder.state_dict_for_save_checkpoint( destination, prefix, keep_vars) + if args.random_ltd: + # When using random-LTD, it is required to call remove_random_ltd_state_dict + # during model checkpoint saving to transfer the random-LTD-wrapped + # layers back to original layers. This will help to remove the dependency + # to random-LTD inside the checkpoint, so that during evaluation or + # finetuning of the checkpoint there is no need to depend on random-LTD + # again. 
+ from deepspeed.runtime.data_pipeline.data_routing.helper import remove_random_ltd_state_dict + encoder_state_dict = remove_random_ltd_state_dict(encoder_state_dict) # MoE states need to be handled separately by DeepSpeed engine, thus # moving them to the top level dictionary # If components other than encoder may contain MoE states, need to add diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 7b1fd0e78..24b46d5c7 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -464,7 +464,7 @@ def __init__(self, init_method, output_layer_init_method, drop_tokens=args.moe_token_dropping, use_tutel=args.use_tutel, enable_expert_tensor_parallelism=enable_expert_tensor_parallelism) - def forward(self, hidden_states, attention_mask, + def forward(self, hidden_states, attention_mask=None, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False): # hidden_states: [b, s, h] @@ -706,7 +706,7 @@ def custom_forward(*inputs): moe_losses = [] for index in range(start, end): layer = self._get_layer(index) - x_, moe_loss = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask) + x_, moe_loss = layer(x_, attention_mask=attention_mask, encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask) moe_losses.append(moe_loss) return (x_, *moe_losses) return custom_forward @@ -779,7 +779,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, if layer_past is not None: past = layer_past[index] hidden_states = layer(hidden_states, - attention_mask, + attention_mask=attention_mask, encoder_output=encoder_output, enc_dec_attn_mask=enc_dec_attn_mask, layer_past=past, diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 492f1f10b..7f7c37fb3 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -41,7 +41,7 @@ def init_(tensor): def attention_mask_func(attention_scores, attention_mask): args = get_args() - if args.curriculum_learning: + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: attention_mask_ = attention_mask actual_seqlen = attention_scores.size()[2] if actual_seqlen != attention_mask_.size()[2]: diff --git a/megatron/training.py b/megatron/training.py index f02df5ba2..92d4ba2a8 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -57,6 +57,8 @@ from deepspeed.compression.compress import init_compression, redundancy_clean +from megatron.model.transformer import ParallelTransformerLayer +from deepspeed.runtime.data_pipeline.data_routing.helper import convert_to_random_ltd def print_datetime(string): """Note that this call will sync across all ranks.""" @@ -69,7 +71,8 @@ def pretrain(train_valid_test_dataset_provider, model_provider, forward_step_func, extra_args_provider=None, - args_defaults={}): + args_defaults={}, + data_post_process=None): """Main training program. 
This function will run the followings in the order provided: @@ -118,9 +121,9 @@ def pretrain(train_valid_test_dataset_provider, open(args.deepspeed_config, 'r', encoding='utf-8')) if "curriculum_learning" in args.deepspeed_configuration and \ "enabled" in args.deepspeed_configuration["curriculum_learning"]: - args.curriculum_learning = args.deepspeed_configuration[ \ + args.curriculum_learning_legacy = args.deepspeed_configuration[ \ "curriculum_learning"]["enabled"] - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: from deepspeed.runtime.data_pipeline.curriculum_scheduler \ import CurriculumScheduler args.curriculum_scheduler = CurriculumScheduler( \ @@ -130,7 +133,9 @@ def pretrain(train_valid_test_dataset_provider, # Model, optimizer, and learning rate. timers('model-and-optimizer-setup').start() - model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider, teacher=False) + model, optimizer, lr_scheduler = setup_model_and_optimizer( + model_provider, teacher=False, data_post_process=data_post_process, + build_train_valid_test_datasets_provider=train_valid_test_dataset_provider) timers('model-and-optimizer-setup').stop() print_datetime('after model, optimizer, and learning rate ' 'scheduler are built') @@ -149,6 +154,18 @@ def pretrain(train_valid_test_dataset_provider, train_data_iterator, valid_data_iterator, test_data_iterator \ = build_train_valid_test_data_iterators( train_valid_test_dataset_provider) + if args.data_efficiency_curriculum_learning: + if args.deepspeed_dataloader is not None: + # We use args to pass the deepspeed_dataloader because adding + # output to setup_model_and_optimizer will break the API for other + # cases. We clear args.deepspeed_dataloader after updating + # train_data_iterator because args will be saved in checkpoint and + # attempting to save the whole deepspeed_dataloader will lead to + # "AttributeError: Can't pickle local object...". + train_data_iterator = iter(args.deepspeed_dataloader) + args.deepspeed_dataloader = None + else: + train_data_iterator = None timers('train/valid/test-data-iterators-setup').stop() print_datetime('after dataloaders are built') @@ -411,7 +428,8 @@ def load_model_weights_only(model_provider_func): return model, optimizer, lr_scheduler -def setup_model_and_optimizer(model_provider_func, teacher=False): +def setup_model_and_optimizer(model_provider_func, teacher=False, + data_post_process=None, build_train_valid_test_datasets_provider=None): """Setup model and optimizer.""" args = get_args() @@ -461,13 +479,47 @@ def setup_model_and_optimizer(model_provider_func, teacher=False): if args.deepspeed: print_rank_0("DeepSpeed is enabled.") pp = mpu.get_pipeline_model_parallel_world_size() - model, optimizer, _, lr_scheduler = deepspeed.initialize( - model=model[0], - optimizer=optimizer, - args=args, - lr_scheduler=lr_scheduler, - mpu=mpu if args.no_pipeline_parallel else None - ) + if args.data_efficiency_curriculum_learning and build_train_valid_test_datasets_provider is not None: + train_ds = None + # Only need to build dataset on tp rank 0 since Megatron has the + # broadcast_data() function that broadcast data from tp rank 0. + if mpu.get_tensor_model_parallel_rank() == 0: + # Number of train/valid/test samples. 
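# Illustrative sketch only (made-up settings): the sample counts handed to the
# dataset provider; only the train count matters for this path, the eval/test
# entries exist purely to satisfy the provider's signature.
train_iters = 1000
global_batch_size = 512
eval_interval, eval_iters = 100, 10
train_samples = train_iters * global_batch_size                     # 512000
eval_iters_total = (train_iters // eval_interval + 1) * eval_iters  # 110
train_val_test_num_samples = [train_samples,
                              eval_iters_total * global_batch_size,  # 56320
                              eval_iters * global_batch_size]        # 5120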
+ if args.train_samples: + train_samples = args.train_samples + else: + train_samples = args.train_iters * args.global_batch_size + # eval_iters and test_iters here are not actually used, only for + # satisfying the input of build_train_valid_test_datasets_provider. + # We only need to build the training data here. And we follow + # baseline's logic to build eval/test dataset later in + # build_train_valid_test_data_iterators. + eval_iters = (args.train_iters // args.eval_interval + 1) * \ + args.eval_iters + test_iters = args.eval_iters + train_val_test_num_samples = [train_samples, + eval_iters * args.global_batch_size, + test_iters * args.global_batch_size] + # Build the datasets. + train_ds, _, _ = build_train_valid_test_datasets_provider( + train_val_test_num_samples) + model, optimizer, args.deepspeed_dataloader, lr_scheduler = deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=lr_scheduler, + training_data=train_ds, + mpu=mpu if args.no_pipeline_parallel else None + ) + model.set_data_post_process_func(data_post_process) + else: + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model[0], + optimizer=optimizer, + args=args, + lr_scheduler=lr_scheduler, + mpu=mpu if args.no_pipeline_parallel else None + ) if isinstance(model, deepspeed.PipelineEngine): # hack to get batch_fn from pretrain_gpt.py model.set_batch_fn(model.module._megatron_batch_fn) @@ -507,6 +559,10 @@ def setup_model_and_optimizer(model_provider_func, teacher=False): if args.fp16: optimizer.reload_model_params() + # random-LTD requires converting transformer layers + if args.random_ltd: + model[0] = convert_to_random_ltd(model[0], ParallelTransformerLayer) + return model, optimizer, lr_scheduler @@ -744,9 +800,27 @@ def add_to_logging(name): args.consumed_train_samples) writer.add_scalar('params-norm/params-norm vs tokens', params_norm, args.consumed_train_tokens) - if args.curriculum_learning: - writer.add_scalar('curriculum_seqlen', args.curriculum_seqlen, + if hasattr(args, 'actual_seq_length'): + writer.add_scalar('seqlen/actual_seq_length', args.actual_seq_length, + iteration) + writer.add_scalar('seqlen/actual_seq_length vs samples', args.actual_seq_length, + args.consumed_train_samples) + writer.add_scalar('seqlen/actual_seq_length vs tokens', args.actual_seq_length, + args.consumed_train_tokens) + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + writer.add_scalar('seqlen/curriculum_seqlen', args.curriculum_seqlen, + iteration) + writer.add_scalar('seqlen/curriculum_seqlen vs samples', args.curriculum_seqlen, + args.consumed_train_samples) + writer.add_scalar('seqlen/curriculum_seqlen vs tokens', args.curriculum_seqlen, + args.consumed_train_tokens) + if args.random_ltd: + writer.add_scalar('seqlen/random_ltd_reserved_length', args.random_ltd_reserved_length, iteration) + writer.add_scalar('seqlen/random_ltd_reserved_length vs samples', args.random_ltd_reserved_length, + args.consumed_train_samples) + writer.add_scalar('seqlen/random_ltd_reserved_length vs tokens', args.random_ltd_reserved_length, + args.consumed_train_tokens) if args.log_timers_to_tensorboard: timers.write(timers_to_log, writer, iteration, normalizer=total_iterations) @@ -825,7 +899,9 @@ def add_to_logging(name): if iteration % args.log_interval == 0: elapsed_time = timers('interval-time').elapsed() elapsed_time_per_iteration = elapsed_time / total_iterations - seq_len = args.curriculum_seqlen if args.curriculum_learning else args.seq_length + seq_len = 
args.seq_length + if hasattr(args, 'actual_seq_length'): + seq_len = args.actual_seq_length hidden_size = args.hidden_size num_layers = args.num_layers vocab_size = args.padded_vocab_size @@ -871,8 +947,11 @@ def add_to_logging(name): log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad) if params_norm is not None: log_string += ' params norm: {:.3f} |'.format(params_norm) - if args.curriculum_learning: + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: log_string += ' curriculum seqlen: {:5d} |'.format(args.curriculum_seqlen) + if args.random_ltd: + log_string += ' random ltd reserved length: {:5d} |'.format(args.random_ltd_reserved_length) + log_string += ' actual seqlen: {:5d} |'.format(seq_len) log_string += ' number of skipped iterations: {:3d} |'.format( total_loss_dict[skipped_iters_key]) log_string += ' number of nan iterations: {:3d} |'.format( @@ -915,6 +994,11 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Write args to tensorboard write_args_to_tensorboard() + if args.random_ltd: + # random-ltd requires different randomness on each rank + import random + random.seed(args.seed + torch.distributed.get_rank()) + # Turn on training mode which enables dropout. for model_module in model: model_module.train() @@ -928,6 +1012,10 @@ def train(forward_step_func, model, optimizer, lr_scheduler, timers('interval-time').start() print_datetime('before the start of training step') report_memory_flag = True + if args.random_ltd: + assert model[0].random_ltd_enabled() + args.random_ltd_layer_num = model[0].random_ltd_scheduler.get_random_ltd_layer_num() + while iteration < args.train_iters and (args.train_tokens is None or \ args.consumed_train_tokens < args.train_tokens): update_num_microbatches(args.consumed_train_samples) @@ -938,7 +1026,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, get_num_microbatches() model[0].set_train_batch_size(global_batch_size) - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ args.iteration + 1) loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \ @@ -953,14 +1041,25 @@ def train(forward_step_func, model, optimizer, lr_scheduler, args.micro_batch_size * \ get_num_microbatches() args.consumed_train_samples += new_samples - if not args.custom_token_counting: - # Models like BERT have padding thus need special token counting. - # See example in ../../pretrain_bert.py. - if args.curriculum_learning: - args.consumed_train_tokens += new_samples * args.curriculum_seqlen + # This actual_seq_length is used for actual consumed tokens calculation, flops calculation, and logging. 
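# Worked example only (made-up values): the effective sequence length when
# random-LTD is active is a per-layer average of the full sequence (in the
# unwrapped layers) and the reserved length (in the random-LTD layers).
seq_length = 2048
num_layers = 24
random_ltd_layer_num = 22          # layers wrapped by random-LTD
random_ltd_reserved_length = 1024  # tokens kept in those layers
actual_seq_length = (seq_length * (num_layers - random_ltd_layer_num)
                     + random_ltd_reserved_length * random_ltd_layer_num) // num_layers
# -> (2048*2 + 1024*22) // 24 = 1109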
+ args.actual_seq_length = args.seq_length + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + args.actual_seq_length = args.curriculum_seqlen + if args.random_ltd: + args.random_ltd_reserved_length = model[0].random_ltd_scheduler.get_current_seq() + if args.random_ltd_reserved_length < args.actual_seq_length: + args.actual_seq_length = (args.actual_seq_length * (args.num_layers - args.random_ltd_layer_num) + args.random_ltd_reserved_length * args.random_ltd_layer_num) // args.num_layers + if args.curriculum_learning_legacy or args.data_efficiency_curriculum_learning: + if hasattr(args, 'data_efficiency_curriculum_learning_numel'): + act_mbsz = args.data_efficiency_curriculum_learning_numel / args.curriculum_seqlen + act_token = act_mbsz * args.actual_seq_length + args.consumed_train_tokens += mpu.get_data_parallel_world_size() * \ + get_num_microbatches() * act_token else: - args.consumed_train_tokens += new_samples * args.seq_length - + args.consumed_train_tokens += new_samples * args.actual_seq_length + else: + args.consumed_train_tokens += new_samples * args.actual_seq_length + # Logging. if args.deepspeed: if hasattr(model[0].optimizer, 'cur_scale'): @@ -1037,7 +1136,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): for model_module in model: model_module.eval() - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: # When curriculum learning is used with pipeline parallelism, we need # this logic to ensure that the eval data is not truncated. If there # is a seqlen change due to that, we need to call @@ -1093,7 +1192,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): for key in total_loss_dict: total_loss_dict[key] /= args.eval_iters * get_num_microbatches() - if args.curriculum_learning and not args.no_pipeline_parallel: + if args.curriculum_learning_legacy and not args.no_pipeline_parallel: # roll back to actual curriculum seqlen at the end of eval. args.curriculum_seqlen = args.curriculum_scheduler.update_difficulty( \ args.iteration + 1) diff --git a/megatron/utils.py b/megatron/utils.py index 59a0a12f3..99c9438bc 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -231,7 +231,10 @@ def throughput_calculator(model, args, iteration_time, total_iterations): # The factor of 4 is when used with activation check-pointing, # otherwise it will be 3. checkpoint_activations_factor = 4 if args.checkpoint_activations else 3 - flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * args.seq_length * num_layers * (hidden_size**2)) * (1. + (args.seq_length / (6. * hidden_size)) + (vocab_size / (16. * num_layers * hidden_size))) + seq_len = args.seq_length + if hasattr(args, 'actual_seq_length'): + seq_len = args.actual_seq_length + flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2)) * (1. + (seq_len / (6. * hidden_size)) + (vocab_size / (16. 
* num_layers * hidden_size))) tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12)) return samples_per_second, tflops, approx_parameters_in_billions diff --git a/pretrain_bert.py b/pretrain_bert.py index df5be7a06..c550d27e9 100644 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -17,6 +17,7 @@ from functools import partial +import math import torch import torch.nn.functional as F @@ -72,6 +73,28 @@ def get_batch(data_iterator): return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + effective_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + else: + effective_seqlen = torch.count_nonzero(data['padding_mask'], dim=1) + effective_seqlen = torch.max(effective_seqlen).to(torch.cuda.current_device()) + torch.distributed.all_reduce(effective_seqlen, + op=torch.distributed.ReduceOp.MAX, + group=mpu.get_data_parallel_group()) + effective_seqlen = effective_seqlen.item() + # Has to be multiple of 8 to enable Tensor Core acceleration + if effective_seqlen % 8 != 0: + effective_seqlen = math.ceil(effective_seqlen / 8) * 8 + if effective_seqlen < args.seq_length: + data['text'] = data['text'][:, :effective_seqlen].contiguous() + data['types'] = data['types'][:, :effective_seqlen].contiguous() + data['loss_mask'] = data['loss_mask'][:, :effective_seqlen].contiguous() + data['labels'] = data['labels'][:, :effective_seqlen].contiguous() + data['padding_mask'] = data['padding_mask'][:, :effective_seqlen].contiguous() + return data def loss_func(loss_mask, sentence_order, output_tensor): lm_loss_, sop_logits = output_tensor @@ -110,10 +133,8 @@ def forward_step(data_iterator, model): data_iterator) timers('batch-generator').stop() - effective_train_tokens = torch.count_nonzero(padding_mask) - torch.distributed.all_reduce(effective_train_tokens, - group=mpu.get_data_parallel_group()) - args.consumed_train_tokens += effective_train_tokens.item() + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] if not args.bert_binary_head: types = None @@ -150,4 +171,5 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if __name__ == "__main__": pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}, + data_post_process=data_post_process) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index f57b479a2..369152c8d 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -16,6 +16,7 @@ """Pretrain GPT""" import torch +import math from functools import partial from megatron import get_args from megatron import print_rank_0 @@ -116,6 +117,30 @@ def get_batch(data_iterator): return tokens, labels, loss_mask, attention_mask, position_ids +def data_post_process(data, data_sampler_state_dict): + args = get_args() + if args.data_efficiency_curriculum_learning: + if 'seqlen_truncate' in data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_truncate' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_truncate'] + if current_seqlen < args.seq_length: + data['text'] = data['text'][:, :(current_seqlen+1)].contiguous() + elif 'seqlen_reshape' in 
data_sampler_state_dict['current_difficulties']: + args.data_efficiency_curriculum_learning_seqlen_type = 'seqlen_reshape' + current_seqlen = data_sampler_state_dict['current_difficulties']['seqlen_reshape'] + if current_seqlen < args.seq_length: + orig_num_token = torch.numel(data['text']) + reshape_len = (data['text'].size()[1] // (current_seqlen+1)) * (current_seqlen+1) + data['text'] = torch.cat((data['text'][:, :reshape_len].contiguous().view(-1, current_seqlen+1), + data['text'][:, -(current_seqlen+1):]), 0).contiguous() + num_row = math.ceil(orig_num_token / (current_seqlen+1)) + num_row = min(num_row, data['text'].size()[0]) + if num_row > 1 and num_row % 2 != 0: + num_row -= 1 + data['text'] = data['text'][:num_row, :].contiguous() + else: + args.data_efficiency_curriculum_learning_seqlen_type = None + return data def get_batch_pipe(data): """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" @@ -141,7 +166,7 @@ def get_batch_pipe(data): args.reset_position_ids, args.reset_attention_mask, args.eod_mask_loss) - if args.curriculum_learning and args.curriculum_seqlen < tokens.size()[1]: + if args.curriculum_learning_legacy and args.curriculum_seqlen < tokens.size()[1]: # seqlen-based curriculum learning # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] tokens = tokens[:, :args.curriculum_seqlen].contiguous() @@ -184,7 +209,7 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at if teacher_model: with torch.no_grad(): - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: assert args.curriculum_seqlen is not None curriculum_seqlen = args.curriculum_seqlen tokens = tokens[:, :curriculum_seqlen].contiguous() @@ -213,17 +238,23 @@ def forward_step(data_iterator, model): data_iterator) timers('batch-generator').stop() + if args.data_efficiency_curriculum_learning: + args.curriculum_seqlen = tokens.size()[1] + if hasattr(args, 'data_efficiency_curriculum_learning_seqlen_type') and \ + args.data_efficiency_curriculum_learning_seqlen_type == 'seqlen_reshape': + args.data_efficiency_curriculum_learning_numel = torch.numel(tokens) + if args.mos or args.kd: # The forward func can return either the loss or the logits, depending on whether passing in the labels or not. 
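# Illustrative sketch only (toy shapes): what the 'seqlen_reshape' post-process
# does to a batch. Tokens are repacked into rows of current_seqlen+1 so that a
# shorter curriculum length still consumes (nearly) all of the original tokens.
import math
import torch
text = torch.arange(36).view(4, 9)        # toy batch: 4 samples, seq_length 8 (+1 shift)
current_seqlen = 2                        # curriculum difficulty -> rows of 3 tokens
reshape_len = (text.size(1) // (current_seqlen + 1)) * (current_seqlen + 1)   # 9
repacked = torch.cat((text[:, :reshape_len].contiguous().view(-1, current_seqlen + 1),
                      text[:, -(current_seqlen + 1):]), 0).contiguous()       # [16, 3]
num_row = math.ceil(torch.numel(text) / (current_seqlen + 1))                 # 12
num_row = min(num_row, repacked.size(0))
if num_row > 1 and num_row % 2 != 0:
    num_row -= 1
repacked = repacked[:num_row, :]          # final shape: [12, 3]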
stu_output, *other_losses = model(tokens, position_ids, attention_mask) - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: assert args.curriculum_seqlen is not None labels = labels[:, :args.curriculum_seqlen].contiguous() output_tensor = mpu.vocab_parallel_cross_entropy(stu_output.contiguous().float(), labels) else: output_tensor, *other_losses = model(tokens, position_ids, attention_mask, labels=labels) - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + if args.curriculum_learning_legacy and args.curriculum_seqlen < args.seq_length: loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() moe_losses = [] @@ -292,4 +323,5 @@ def git_ds_info(): if __name__ == "__main__": git_ds_info() pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + data_post_process=data_post_process) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index e5a852187..ca28e9ee3 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -394,6 +394,7 @@ def tasks_args(parser): group.add_argument('--results_path', type=str, default = "./results.json", help='Path to where the results will be stored.') group.add_argument('--adaptive_seq_len', default = False, action='store_true', help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.') + group.add_argument('--num_fewshot', type=int, default = 0, help='Number of few-shot prompts.') group.add_argument('--eval_fp32', default = False, action='store_true', help='Should the evaluation run in fp32') return parser @@ -408,7 +409,7 @@ def main(): # adaptive_seq_len hack #1: # CL automatically enables reset_activation_shape() which allows us to change input shapes # and it also reshapes the attenion scores in attention_mask_func - args.curriculum_learning = 1 + args.curriculum_learning_legacy = 1 task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') task_dict = tasks.get_task_dict(task_list) @@ -419,7 +420,7 @@ def main(): tokenizer = get_tokenizer() adaptor = EvalHarnessAdaptor(model, tokenizer) - results = evaluator.evaluate(adaptor, task_dict, False, 0, None) + results = evaluator.evaluate(adaptor, task_dict, False, args.num_fewshot, None) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(results, indent=2)) From 515798f809669124fe178a7aa45af8e127f35a4d Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Wed, 14 Dec 2022 11:05:04 -0800 Subject: [PATCH 06/11] fix floating point in script (#101) --- .../pretrain/ds_pretrain_bert_336M_run.sh | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh index 46c6c48b5..f03c65ccf 100644 --- a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh @@ -17,7 +17,7 @@ # train_iters_in_million=2 # ltd_enabled="true" # ltd_start=200 -# ltd_step_in_million=1.8 +# ltd_step_in_million=18e-1 # dropout=0 # cl_enabled="true" # cl_num_metric=2 @@ -28,7 +28,7 @@ # 
cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # cl_2nd_metric="seqlen_truncate" @@ -38,7 +38,7 @@ # cl_2nd_clustering_type="single_cluster" # cl_2nd_min=128 # cl_2nd_max=512 -# cl_2nd_total_step_in_million=0.96 +# cl_2nd_total_step_in_million=96e-2 # cl_2nd_difficulty_step=8 # cl_2nd_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -55,10 +55,10 @@ ### Random layerwise token dropping (random-LTD). ## random-LTD 723B tokens (69%): # lr=1.45e-4 -# train_iters_in_million=1.38 +# train_iters_in_million=138e-2 # ltd_enabled="true" # ltd_start=200 -# ltd_step_in_million=1.8 +# ltd_step_in_million=18e-1 # dropout=0 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ # ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} @@ -66,11 +66,11 @@ ### Curriculum learning (CL). ## CL vocab rarity 734B tokens (70%): # lr=1.4e-4 -# train_iters_in_million=1.4 +# train_iters_in_million=14e-1 # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="voc" @@ -80,7 +80,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.7 +# cl_1st_total_step_in_million=7e-1 # cl_1st_difficulty_step=1 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -97,7 +97,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=2 # cl_1st_metric="voc" @@ -107,7 +107,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # cl_2nd_metric="seqlen_truncate" @@ -117,7 +117,7 @@ # cl_2nd_clustering_type="single_cluster" # cl_2nd_min=128 # cl_2nd_max=512 -# cl_2nd_total_step_in_million=0.96 +# cl_2nd_total_step_in_million=96e-2 # cl_2nd_difficulty_step=8 # cl_2nd_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -137,7 +137,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="seqlenvocabrarity" @@ -147,7 +147,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=5 # cl_1st_max=100 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -164,7 +164,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="voc" @@ -174,7 +174,7 @@ # cl_1st_clustering_type="schedule_based" # cl_1st_min=600 # cl_1st_max=9069 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -191,7 +191,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="seqlen_truncate" @@ -201,7 +201,7 @@ # cl_1st_clustering_type="single_cluster" # cl_1st_min=128 # cl_1st_max=512 -# cl_1st_total_step_in_million=0.96 +# 
cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=8 # cl_1st_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ @@ -218,7 +218,7 @@ # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 -# dropout=0.1 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=1 # cl_1st_metric="seqlen" @@ -228,7 +228,7 @@ # cl_1st_clustering_type="single_cluster" # cl_1st_min=5 # cl_1st_max=100 -# cl_1st_total_step_in_million=0.96 +# cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=8 # cl_1st_root=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ From 9a52a2e4720a437af557bfccb207616b2b1217e2 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Mon, 19 Dec 2022 09:56:03 -0800 Subject: [PATCH 07/11] added unit test for megatron (#102) --- MANIFEST.in | 1 + tests/run_megatron.py | 113 +++++++++++++++++++++++++++++++++++++++++ tests/test_megatron.py | 61 ++++++++++++++++++++++ 3 files changed, 175 insertions(+) create mode 100644 tests/run_megatron.py create mode 100644 tests/test_megatron.py diff --git a/MANIFEST.in b/MANIFEST.in index f44791183..d6ca373a7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include megatron/data/Makefile include megatron/data/helpers.cpp +recursive-include megatron/fused_kernels *.cpp *.h *.cu *.tr *.cuh *.cc diff --git a/tests/run_megatron.py b/tests/run_megatron.py new file mode 100644 index 000000000..ad96165b5 --- /dev/null +++ b/tests/run_megatron.py @@ -0,0 +1,113 @@ +import torch +import deepspeed +import megatron +from megatron import get_args +from megatron import mpu +from megatron.checkpointing import load_checkpoint +from megatron.initialize import initialize_megatron +from megatron.model import GPTModel +from megatron.training import get_model +from megatron.text_generation_utils import generate_samples_eval + + +def model_provider(pre_process=True, post_process=True): + model = GPTModel( + num_tokentypes=0, + parallel_output=False, + pre_process=pre_process, + post_process=post_process, + return_moe_loss=False, + ) + return model + + +def add_text_generate_args(parser): + """Text generation arguments.""" + group = parser.add_argument_group(title="text generation") + + group.add_argument( + "--temperature", type=float, default=1.0, help="Sampling temperature." + ) + group.add_argument( + "--greedy", action="store_true", default=False, help="Use greedy sampling." 
+ ) + group.add_argument("--top_p", type=float, default=0.0, help="Top p sampling.") + group.add_argument("--top_k", type=int, default=0, help="Top k sampling.") + group.add_argument( + "--out-seq-length", + type=int, + default=1024, + help="Size of the output generated text.", + ) + group.add_argument( + "--sample-input-file", + type=str, + default=None, + help="Get input from file instead of interactive mode, " + "each line is an input.", + ) + group.add_argument( + "--sample-output-file", + type=str, + default=None, + help="Output file got from --sample-input-file", + ) + group.add_argument( + "--num-samples", + type=int, + default=0, + help="Number of samples to generate unconditionally, " + "defaults to 0 and interactive conditional sampling", + ) + group.add_argument( + "--genfile", type=str, help="Output file when generating unconditionally" + ) + group.add_argument( + "--recompute", + action="store_true", + help="During generation recompute all attention " + "instead of using previously computed keys/values.", + ) + group.add_argument( + "--context-tokens", type=str, default="DeepSpeed is the greatest" + ) + group.add_argument("--max-tokens", type=int, default=50) + + return parser + + +if __name__ == "__main__": + # initialize megatron + initialize_megatron( + extra_args_provider=add_text_generate_args, + args_defaults={ + "tokenizer_type": "GPT2BPETokenizer", + "no_load_rng": True, + "no_load_optim": True, + }, + ) + args = get_args() + + # setup model + model = get_model(model_provider) + _ = load_checkpoint(model, None, None) + model = model[0] + if args.ds_inference: + engine = deepspeed.init_inference( + model=model, + mp_size=args.tensor_model_parallel_size, + tensor_parallel={"mpu": mpu}, + dtype=torch.half, + replace_with_kernel_inject=True, + moe_experts=args.num_experts, + moe_type=args.mlp_type, + ) + model = engine.module + + # generate output + generate_samples_eval( + model, args.context_tokens, 1, 0 + ) # Just so we don't get log output from DeepSpeed (this should be removed once we improve logging in DeepSpeed) + print("===START OUTPUT===") + print(generate_samples_eval(model, args.context_tokens, args.max_tokens, 0)) + print("===END OUTPUT===") diff --git a/tests/test_megatron.py b/tests/test_megatron.py new file mode 100644 index 000000000..e7342c244 --- /dev/null +++ b/tests/test_megatron.py @@ -0,0 +1,61 @@ +import pytest +import os +import re +import subprocess + + +@pytest.fixture(params=[1]) +def moe_num_experts(request): + return str(request.param) + + +@pytest.fixture(params=[1]) +def mp_size(request): + return str(request.param) + + +@pytest.fixture +def params(moe_num_experts, mp_size): + base_dir = os.getenv("MEGATRON_CKPT_DIR") + assert base_dir, "Please set MEGATRON_CKPT_DIR in your environment" + + vocab_file = os.path.join(base_dir, "gpt2-vocab.json") + merge_file = os.path.join(base_dir, "gpt2-merges.txt") + ckpt_path = os.path.join(base_dir, "checkpoints/gpt2_345m") + + return [ + "--micro-batch-size", "1", + "--num-layers", "24", + "--hidden-size", "1024", + "--num-attention-heads", "16", + "--max-position-embeddings", "1024", + "--vocab-file", vocab_file, + "--merge-file", merge_file, + "--load", ckpt_path, + "--seq-length", "1024", + "--out-seq-length", "1024", + "--tensor-model-parallel-size", mp_size, + "--tokenizer-type", "GPT2BPETokenizer", + "--num-experts", moe_num_experts, + "--mlp-type", "standard", + "--num-samples", "0", + "--fp16", + ] + + +def test_moe_megatron(params, mp_size): + output_re = r"===START OUTPUT===([\S\s]*)===END 
OUTPUT===" + + # Run the baseline + baseline_cmd = ["deepspeed", "--num_gpus", mp_size, "./run_megatron.py"] + params + result = subprocess.run(baseline_cmd, stdout=subprocess.PIPE) + baseline_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) + + # Run with DeepSpeed + deepspeed_cmd = baseline_cmd + ["--ds-inference"] + result = subprocess.run(deepspeed_cmd, stdout=subprocess.PIPE) + deepspeed_output = re.search(output_re, result.stdout.decode("utf-8")).group(1) + + assert ( + baseline_output == deepspeed_output + ), f"outputs do not match: {baseline_output}\n{deepspeed_output}" From 789f2a9047d565e47c53a813d296775387eb961f Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 20 Dec 2022 15:42:47 -0800 Subject: [PATCH 08/11] fix deprecated numpy types (#103) --- megatron/data/indexed_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 7f1379671..20344f889 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -98,8 +98,8 @@ def write_longs(f, a): 3: np.int16, 4: np.int32, 5: np.int64, - 6: np.float, - 7: np.double, + 6: np.float32, + 7: np.float64, 8: np.uint16 } @@ -271,8 +271,8 @@ class IndexedDatasetBuilder(object): np.int16: 2, np.int32: 4, np.int64: 8, - np.float: 4, - np.double: 8 + np.float32: 4, + np.float64: 8 } def __init__(self, out_file, dtype=np.int32): From c24020485dc9199995c8eb78fdb1c743a4605441 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Fri, 23 Dec 2022 12:59:24 -0800 Subject: [PATCH 09/11] fix script typo --- .../bert/pretrain/ds_pretrain_bert_336M_base_script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh index 551ca3118..b5a59f324 100644 --- a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_base_script.sh @@ -300,7 +300,7 @@ data_options=" \ megatron_options=" \ --override-lr-scheduler \ --adam-beta1 0.9 \ - --adam-beta2 0.95 \ + --adam-beta2 0.999 \ --tensor-model-parallel-size ${mp_size} \ --init-method-std ${init_std} \ --lr-decay-tokens ${lr_decay_tokens} \ @@ -469,4 +469,4 @@ if [[ $iteration -gt 0 ]]; then ds_ssh "echo $iteration_2 > $iteration_file_2" fi -deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log \ No newline at end of file +deepspeed ${dir}/../../../../pretrain_bert.py ${megatron_options} ${data_options} ${deepspeed_options} &>> ${log_path}/${jobname}_${host}_${current_time}.log From 178beeb45d8f89c926366659900934351aa007e6 Mon Sep 17 00:00:00 2001 From: mzl Date: Mon, 30 Jan 2023 23:12:06 +0800 Subject: [PATCH 10/11] Use accelerator for supporting other device (#104) * xpu support (#55) * port accel abs interfece * WA for run3.6b * move on * fix current_dievice * fix typo * enable to run 345M GPT * delete apex_patch * add TODO xpu compatible tg for xpu WA * use deepspeed launcher * enable run3.6b bf16 * add zero2 config json * readd enable_each_rank_log * fix typos * add ccl arg * fix * use short word * use no-masked-softmax-fusion * readd * set train iters to 10 * remove duplicate line * change assert msg * update format * add whitespace * update path * update note * update * fix typos * delete notes * update 
format * update xpu check to cuda check * update * clean up file * fix typos * add python based gradient clipping * change condition for python based path --- megatron/__init__.py | 4 +- megatron/arguments.py | 2 +- megatron/checkpointing.py | 6 +-- megatron/data/biencoder_dataset_utils.py | 4 +- megatron/data/dataset_utils.py | 6 +-- megatron/data/gpt_dataset.py | 4 +- megatron/data/realm_dataset_utils.py | 4 +- megatron/global_vars.py | 6 +-- megatron/initialize.py | 22 +++++----- megatron/memory.py | 4 +- megatron/model/__init__.py | 8 ++-- megatron/model/distributed.py | 4 +- megatron/model/fused_softmax.py | 1 - megatron/model/module.py | 8 ++-- megatron/model/realm_model.py | 6 +-- megatron/model/transformer.py | 3 +- megatron/model/vit_model.py | 3 +- megatron/mpu/data.py | 8 ++-- megatron/mpu/layers.py | 12 +++--- megatron/mpu/random.py | 35 ++++++++-------- megatron/mpu/tests/commons.py | 6 +-- megatron/mpu/tests/test_cross_entropy.py | 9 +++-- megatron/mpu/tests/test_data.py | 3 +- megatron/mpu/tests/test_layers.py | 48 +++++++++++----------- megatron/mpu/tests/test_random.py | 22 +++++----- megatron/optimizer/__init__.py | 9 ++++- megatron/optimizer/clip_grads.py | 51 ++++++++++++++---------- megatron/optimizer/grad_scaler.py | 12 +++--- megatron/optimizer/optimizer.py | 34 +++++++++------- megatron/p2p_communication.py | 8 ++-- megatron/text_generation_utils.py | 39 +++++++++--------- megatron/training.py | 33 +++++++-------- megatron/utils.py | 35 +++++++++------- pretrain_gpt.py | 3 +- pretrain_ict.py | 10 ++--- pretrain_vit.py | 4 +- tasks/eval_harness/evaluate.py | 2 +- tasks/eval_utils.py | 3 +- tasks/finetune_utils.py | 10 ++--- tasks/orqa/evaluate_utils.py | 12 +++--- tasks/orqa/natural_questions/nq.py | 9 +++-- tasks/vision/eval_utils.py | 4 +- tasks/vision/finetune_utils.py | 6 +-- tasks/zeroshot_gpt/evaluate.py | 6 +-- 44 files changed, 282 insertions(+), 246 deletions(-) diff --git a/megatron/__init__.py b/megatron/__init__.py index c5b9b3af6..93894cd3f 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. import os import torch - +from deepspeed.accelerator import get_accelerator from .package_info import ( __description__, __contact_names__, @@ -64,7 +64,7 @@ def is_rank_0(): """Check whether it is rank 0. 
For AML, check if it is rank 0 of a node""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0 or ( - is_aml() and torch.distributed.get_rank() % torch.cuda.device_count() == 0 + is_aml() and torch.distributed.get_rank() % get_accelerator().device_count() == 0 ): return True else: diff --git a/megatron/arguments.py b/megatron/arguments.py index 4c4e00dcd..bdd17450b 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -653,7 +653,7 @@ def _add_distributed_args(parser): group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None, help='Number of layers per virtual pipeline stage') group.add_argument('--distributed-backend', default='nccl', - choices=['nccl', 'gloo'], + choices=['nccl', 'gloo', 'ccl'], help='Which backend to use for distributed training.') group.add_argument('--DDP-impl', default='local', choices=['local', 'torch'], diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 52c2a84da..fbef9da00 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -19,7 +19,7 @@ import random import sys import numpy as np - +from deepspeed.accelerator import get_accelerator import torch from megatron import (get_args, @@ -150,7 +150,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): state_dict['random_rng_state'] = random.getstate() state_dict['np_rng_state'] = np.random.get_state() state_dict['torch_rng_state'] = torch.get_rng_state() - state_dict['cuda_rng_state'] = torch.cuda.get_rng_state() + state_dict['cuda_rng_state'] = get_accelerator().get_rng_state() state_dict['rng_tracker_states'] \ = mpu.get_cuda_rng_tracker().get_states() @@ -417,7 +417,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True random.setstate(state_dict['random_rng_state']) np.random.set_state(state_dict['np_rng_state']) torch.set_rng_state(state_dict['torch_rng_state']) - torch.cuda.set_rng_state(state_dict['cuda_rng_state']) + get_accelerator().set_rng_state(state_dict['cuda_rng_state']) # Check for empty states array if not state_dict['rng_tracker_states']: raise KeyError diff --git a/megatron/data/biencoder_dataset_utils.py b/megatron/data/biencoder_dataset_utils.py index f7b3b961b..e22cc7e89 100644 --- a/megatron/data/biencoder_dataset_utils.py +++ b/megatron/data/biencoder_dataset_utils.py @@ -8,7 +8,7 @@ from megatron.data.dataset_utils import create_masked_lm_predictions, \ pad_and_convert_to_numpy from megatron.data.data_samplers import MegatronPretrainingSampler - +from deepspeed.accelerator import get_accelerator def make_attention_mask(source_block, target_block): """ Returns a 2-dimensional (2-D) attention mask @@ -187,7 +187,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) assert counts[0].item() == torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index cf8ccb9fd..8cb7f85eb 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -33,7 +33,7 @@ ) from megatron.data.blendable_dataset import BlendableDataset from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - +from deepspeed.accelerator import 
get_accelerator DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' @@ -711,8 +711,8 @@ def get_samples_mapping(indexed_dataset, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - if torch.cuda.device_count() > 0: # Skip when CPU-only - counts = torch.cuda.LongTensor([1]) + if get_accelerator().device_count() > 0: # Skip when CPU-only + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) assert counts[0].item() == ( diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 5625f4586..a14179bb9 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -20,7 +20,7 @@ import numpy as np import torch - +from deepspeed.accelerator import get_accelerator from megatron import mpu, is_rank_0, print_rank_0, get_args from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples @@ -313,7 +313,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) assert counts[0].item() == ( diff --git a/megatron/data/realm_dataset_utils.py b/megatron/data/realm_dataset_utils.py index aecf5549a..b7d4a40c6 100644 --- a/megatron/data/realm_dataset_utils.py +++ b/megatron/data/realm_dataset_utils.py @@ -7,7 +7,7 @@ from megatron import mpu, print_rank_0 from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy from megatron import get_args, get_tokenizer, print_rank_0, mpu - +from deepspeed.accelerator import get_accelerator def get_one_epoch_dataloader(dataset, micro_batch_size=None): """Specifically one epoch to be used in an indexing job.""" @@ -177,7 +177,7 @@ def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epo # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model # parallel case - counts = torch.cuda.LongTensor([1]) + counts = get_accelerator().LongTensor([1]) torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) assert counts[0].item() == torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) diff --git a/megatron/global_vars.py b/megatron/global_vars.py index c486f0d6b..21b647dac 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -24,7 +24,7 @@ from megatron.tokenizer import build_tokenizer from .arguments import parse_args from .microbatches import build_num_microbatches_calculator - +from deepspeed.accelerator import get_accelerator _GLOBAL_ARGS = None _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None _GLOBAL_TOKENIZER = None @@ -192,14 +192,14 @@ def __init__(self, name): def start(self): """Start the timer.""" assert not self.started_, 'timer has already been started' - torch.cuda.synchronize() + get_accelerator().synchronize() self.start_time = time.time() self.started_ = True def stop(self): """Stop the timer.""" assert self.started_, 'timer is not started' - torch.cuda.synchronize() + get_accelerator().synchronize() 
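# Illustrative sketch only: the pattern this patch applies throughout is to
# route torch.cuda calls through DeepSpeed's accelerator interface so the same
# code runs on CUDA, XPU, or a CPU-only setup. Only calls already used in the
# patch (device_count, set_device, LongTensor, current_device_name,
# synchronize) are shown here.
import torch
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()
if accel.device_count() > 0:
    accel.set_device(0)
    flag = accel.LongTensor([1])              # device tensor, no torch.cuda reference
    device = accel.current_device_name()
    accel.synchronize()
else:
    flag = torch.LongTensor([1])              # CPU-only fallback
    device = 'cpu'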
self.elapsed_ += (time.time() - self.start_time) self.started_ = False diff --git a/megatron/initialize.py b/megatron/initialize.py index 5ed7dc089..4b7e11a6c 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -21,7 +21,6 @@ import numpy as np import torch - from megatron import fused_kernels from megatron import get_adlr_autoresume from megatron import get_args @@ -30,7 +29,7 @@ from megatron.global_vars import set_global_variables from megatron.mpu import (set_tensor_model_parallel_rank, set_tensor_model_parallel_world_size) - +from deepspeed.accelerator import get_accelerator import deepspeed import deepspeed.utils.groups as groups @@ -46,7 +45,7 @@ def initialize_megatron(extra_args_provider=None, args_defaults={}, """ if not allow_no_cuda: # Make sure cuda is available. - assert torch.cuda.is_available(), 'Megatron requires CUDA.' + assert get_accelerator().is_available(), 'Megatron requires accelerator.' # Parse args, build tokenizer, and set adlr-autoresume, # tensorboard-writer, and timers. @@ -107,7 +106,10 @@ def _compile_dependencies(): compile_helper() print('>>> done with dataset index builder. Compilation time: {:.3f} ' 'seconds'.format(time.time() - start_time), flush=True) - + + if not get_accelerator().device_name() == 'cuda': + print(">fused kernel is only supported in cuda, skip loading fused kernel") + return # ================== # Load fused kernels # ================== @@ -134,7 +136,7 @@ def _compile_dependencies(): if _is_rank_0(): start_time = time.time() print('> compiling and loading fused kernels ...', flush=True) - if torch.cuda.device_count() > 0: # Skip when CPU-only + if get_accelerator().device_count() > 0: # Skip when CPU-only fused_kernels.load(args) torch.distributed.barrier() else: @@ -185,7 +187,7 @@ def setup_deepspeed_random_and_activation_checkpointing(args): def _initialize_distributed(): """Initialize torch.distributed and mpu.""" args = get_args() - device_count = torch.cuda.device_count() + device_count = get_accelerator().device_count() if torch.distributed.is_initialized(): if args.rank == 0: @@ -206,7 +208,7 @@ def _initialize_distributed(): else: args.local_rank = device - torch.cuda.set_device(device) # only do so when device_count > 0 + get_accelerator().set_device(device) # only do so when device_count > 0 # Call the init process init_method = 'tcp://' @@ -249,14 +251,14 @@ def _set_random_seed(seed_): if seed_ is not None and seed_ > 0: # Ensure that different pipeline MP stages get different seeds. # No need to do so for CPU-only case. - if torch.cuda.device_count() == 0: + if get_accelerator().device_count() == 0: seed = seed_ else: seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank()) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - if torch.cuda.device_count() > 0: + if get_accelerator().device_count() > 0: mpu.model_parallel_cuda_manual_seed(seed) else: raise ValueError('Seed ({}) should be a positive integer.'.format(seed)) @@ -284,7 +286,7 @@ def _is_rank_0(): """Check whether it is rank 0. 
For AML, check if it is rank 0 of a node""" if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0 or ( - 'AZUREML_EXPERIMENT_ID' in os.environ and torch.distributed.get_rank() % torch.cuda.device_count() == 0 + 'AZUREML_EXPERIMENT_ID' in os.environ and torch.distributed.get_rank() % get_accelerator().device_count() == 0 ): return True else: diff --git a/megatron/memory.py b/megatron/memory.py index be5a117bc..3b06d60d4 100644 --- a/megatron/memory.py +++ b/megatron/memory.py @@ -15,7 +15,7 @@ import torch - +from deepspeed.accelerator import get_accelerator # A dictionary of all the memory buffers allocated. _MEM_BUFFS = dict() @@ -58,7 +58,7 @@ def __init__(self, name, numel, dtype, track_usage): self.dtype = dtype self.data = torch.empty(self.numel, dtype=self.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) # Index tracking the start of the free memory. diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index baf54e455..a2f3be016 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -12,9 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm - +from deepspeed.accelerator.real_accelerator import get_accelerator +if get_accelerator().device_name() == 'cuda': + from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm +else: + from torch.nn import LayerNorm from .distributed import DistributedDataParallel from .bert_model import BertModel from .gpt_model import GPTModel, GPTModelPipe diff --git a/megatron/model/distributed.py b/megatron/model/distributed.py index 53d336223..ad86345c7 100644 --- a/megatron/model/distributed.py +++ b/megatron/model/distributed.py @@ -22,7 +22,7 @@ from megatron import get_args from megatron import mpu from .module import MegatronModule - +from deepspeed.accelerator import get_accelerator class MemoryBuffer: @@ -32,7 +32,7 @@ def __init__(self, numel, dtype): self.dtype = dtype self.data = torch.zeros(self.numel, dtype=self.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 097b29ef4..8d52967d9 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -154,7 +154,6 @@ def forward(self, input, mask): input = input * self.scale mask_output = self.mask_func(input, mask) if mask is not None else input probs = torch.nn.Softmax(dim=-1)(mask_output) - if self.input_in_float16 and self.softmax_in_fp32: if self.input_in_fp16: probs = probs.half() diff --git a/megatron/model/module.py b/megatron/model/module.py index df92d95a9..9f91c8bd1 100644 --- a/megatron/model/module.py +++ b/megatron/model/module.py @@ -18,14 +18,14 @@ import torch from torch.autograd import Variable from torch.nn.parameter import Parameter - +from deepspeed.accelerator import get_accelerator from megatron import get_args from megatron import mpu -_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) -_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) -_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) +_FLOAT_TYPES = (torch.FloatTensor, get_accelerator().FloatTensor) +_HALF_TYPES = (torch.HalfTensor, get_accelerator().HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, 
get_accelerator().BFloat16Tensor) diff --git a/megatron/model/realm_model.py b/megatron/model/realm_model.py index 5730a85e3..e74eb2e58 100644 --- a/megatron/model/realm_model.py +++ b/megatron/model/realm_model.py @@ -12,7 +12,7 @@ from megatron.model.language_model import get_language_model from megatron.model.utils import scaled_init_method_normal from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids - +from deepspeed.accelerator import get_accelerator def general_ict_model_provider(only_query_model=False, only_block_model=False): """Build the model.""" @@ -72,7 +72,7 @@ def forward(self, query_tokens, query_attention_mask, block_tokens, block_attent def embed_query(self, query_tokens, query_attention_mask): """Embed a batch of tokens using the query model""" if self.use_query_model: - query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) + query_types = get_accelerator().LongTensor(*query_tokens.shape).fill_(0) query_ict_logits, _ = self.query_model.forward(query_tokens, query_attention_mask, query_types) return query_ict_logits else: @@ -81,7 +81,7 @@ def embed_query(self, query_tokens, query_attention_mask): def embed_block(self, block_tokens, block_attention_mask): """Embed a batch of tokens using the block model""" if self.use_block_model: - block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0) + block_types = get_accelerator().LongTensor(*block_tokens.shape).fill_(0) block_ict_logits, _ = self.block_model.forward(block_tokens, block_attention_mask, block_types) return block_ict_logits else: diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 24b46d5c7..692eac3d9 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -29,6 +29,7 @@ from torch import distributed as dist import deepspeed from deepspeed.moe.layer import MoE +from deepspeed.accelerator import get_accelerator # flags required to enable jit fusion kernels torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) @@ -275,7 +276,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, output_size[2], output_size[3], dtype=query_layer.dtype, - device=torch.cuda.current_device()) + device=get_accelerator().current_device_name()) # Raw attention scores. 
[b * np, sq, sk] matmul_result = torch.baddbmm( diff --git a/megatron/model/vit_model.py b/megatron/model/vit_model.py index 84a52a829..ae3d56281 100644 --- a/megatron/model/vit_model.py +++ b/megatron/model/vit_model.py @@ -27,6 +27,7 @@ scaled_init_method_normal, ) from .module import MegatronModule +from deepspeed.accelerator import get_accelerator class VitMlpHead(MegatronModule): @@ -164,7 +165,7 @@ def __init__(self, num_classes, finetune=False): init_method_normal(args.init_method_std)( self.position_embeddings.weight ) - self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() + self.position_ids = torch.arange(self.seq_length).expand(1, -1).to(get_accelerator().device_name()) self.position_embeddings._register_load_state_dict_pre_hook( twod_interpolate_position_embeddings_hook diff --git a/megatron/mpu/data.py b/megatron/mpu/data.py index dd57a8ffc..679955aa1 100644 --- a/megatron/mpu/data.py +++ b/megatron/mpu/data.py @@ -18,7 +18,7 @@ from .initialize import get_tensor_model_parallel_group from .initialize import get_tensor_model_parallel_rank from .initialize import get_tensor_model_parallel_src_rank - +from deepspeed.accelerator import get_accelerator _MAX_DATA_DIM = 5 @@ -46,7 +46,7 @@ def _build_key_size_numel_dictionaries(keys, data): offset += max_dim # Move to GPU and broadcast. - sizes_cuda = torch.cuda.LongTensor(sizes) + sizes_cuda = get_accelerator().LongTensor(sizes) torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group()) @@ -94,10 +94,10 @@ def broadcast_data(keys, data, datatype): _check_data_types(keys, data, datatype) # Flatten the data associated with the keys flatten_data = torch.cat( - [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() + [data[key].contiguous().view(-1) for key in keys], dim=0).to(get_accelerator().device_name()) else: flatten_data = torch.empty(total_numel, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=datatype) # Broadcast diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 0d81d5622..5d168c672 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -37,7 +37,7 @@ from .utils import VocabUtility from megatron import get_args import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing - +from deepspeed.accelerator import get_accelerator _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False, 'partition_dim': -1, @@ -176,7 +176,7 @@ def __init__(self, num_embeddings, embedding_dim, else: self.weight = Parameter(torch.empty( self.num_embeddings_per_partition, self.embedding_dim, - device=torch.cuda.current_device(), dtype=args.params_dtype)) + device=get_accelerator().current_device_name(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=1) @@ -264,7 +264,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, else: self.weight = Parameter(torch.empty( self.output_size_per_partition, self.input_size, - device=torch.cuda.current_device(), dtype=args.params_dtype)) + device=get_accelerator().current_device_name(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=0, stride=stride) @@ -275,7 +275,7 @@ def __init__(self, input_size, output_size, bias=True, gather_output=True, else: self.bias = Parameter(torch.empty( self.output_size_per_partition, - device=torch.cuda.current_device(), + 
device=get_accelerator().current_device_name(), dtype=args.params_dtype)) set_tensor_model_parallel_attributes(self.bias, True, 0, stride) # Always initialize bias to zero. @@ -375,7 +375,7 @@ def __init__(self, input_size, output_size, bias=True, else: self.weight = Parameter(torch.empty( self.output_size, self.input_size_per_partition, - device=torch.cuda.current_device(), dtype=args.params_dtype)) + device=get_accelerator().current_device_name(), dtype=args.params_dtype)) _initialize_affine_weight_gpu(self.weight, init_method, partition_dim=1, stride=stride) if bias: @@ -384,7 +384,7 @@ def __init__(self, input_size, output_size, bias=True, dtype=args.params_dtype)) else: self.bias = Parameter(torch.empty( - self.output_size, device=torch.cuda.current_device(), + self.output_size, device=get_accelerator().current_device_name(), dtype=args.params_dtype)) # Always initialize bias to zero. with torch.no_grad(): diff --git a/megatron/mpu/random.py b/megatron/mpu/random.py index 2a57316a1..beea791b5 100644 --- a/megatron/mpu/random.py +++ b/megatron/mpu/random.py @@ -21,7 +21,7 @@ import torch from torch import _C -from torch.cuda import _lazy_call, device as device_ctx_manager +from deepspeed.accelerator import get_accelerator from torch.utils.checkpoint import detach_variable from megatron import get_args @@ -80,25 +80,26 @@ def _set_cuda_rng_state(new_state, device=-1): if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): # older PyTorch def cb(): - with device_ctx_manager(device): + with get_accelerator().device(device): _C._cuda_setRNGState(new_state) else: # newer PyTorch if device == -1: - device = torch.device('cuda') + device = torch.device(get_accelerator().device_name()) elif isinstance(device, str): device = torch.device(device) elif isinstance(device, int): - device = torch.device('cuda', device) + device = torch.device(get_accelerator().device_name(), device) def cb(): idx = device.index if idx is None: - idx = torch.cuda.current_device() - default_generator = torch.cuda.default_generators[idx] + idx = get_accelerator().current_device() + + default_generator = get_accelerator().default_generator(idx) default_generator.set_state(new_state) - _lazy_call(cb) + get_accelerator().lazy_call(cb) def split_tensor_into_1d_equal_chunks(tensor): @@ -116,7 +117,7 @@ def gather_split_1d_tensor(tensor): numel = torch.numel(tensor) numel_gathered = world_size * numel gathered = torch.empty(numel_gathered, dtype=tensor.dtype, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), requires_grad=False) chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)] torch.distributed.all_gather(chunks, tensor, @@ -167,10 +168,10 @@ def add(self, name, seed): if name in self.states_: raise Exception('cuda rng state {} already exists'.format(name)) # Get the current rng state. - orig_rng_state = torch.cuda.get_rng_state() + orig_rng_state = get_accelerator().get_rng_state() # Set the new state and store it. - torch.cuda.manual_seed(seed) - self.states_[name] = torch.cuda.get_rng_state() + get_accelerator().manual_seed(seed) + self.states_[name] = get_accelerator().get_rng_state() # Reset rng state to what it was. _set_cuda_rng_state(orig_rng_state) @@ -183,7 +184,7 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): print(name, self.states_) raise Exception('cuda rng state {} is not added'.format(name)) # Store current rng state. 
- orig_cuda_rng_state = torch.cuda.get_rng_state() + orig_cuda_rng_state = get_accelerator().get_rng_state() # Set rng state to the desired one _set_cuda_rng_state(self.states_[name]) # Do the stuff we wanted to do. @@ -191,7 +192,7 @@ def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): yield finally: # Update the current rng state for later use. - self.states_[name] = torch.cuda.get_rng_state() + self.states_[name] = get_accelerator().get_rng_state() # And set the state to the original state we started with. _set_cuda_rng_state(orig_cuda_rng_state) @@ -237,7 +238,7 @@ def model_parallel_cuda_manual_seed(seed): data_parallel_seed), flush=True) _CUDA_RNG_STATE_TRACKER.reset() # Set the default state. - torch.cuda.manual_seed(data_parallel_seed) + get_accelerator().manual_seed(data_parallel_seed) # and model parallel state. _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, tensor_model_parallel_seed) @@ -256,7 +257,7 @@ def forward(ctx, run_function, *args): # Copy the rng states. ctx.fwd_cpu_rng_state = torch.get_rng_state() - ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() + ctx.fwd_cuda_rng_state = get_accelerator().get_rng_state() ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() with torch.no_grad(): @@ -288,7 +289,7 @@ def backward(ctx, *args): # Store the current states. bwd_cpu_rng_state = torch.get_rng_state() - bwd_cuda_rng_state = torch.cuda.get_rng_state() + bwd_cuda_rng_state = get_accelerator().get_rng_state() bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() # Set the states to what it used to be before the forward pass. @@ -309,7 +310,7 @@ def backward(ctx, *args): if isinstance(outputs, torch.Tensor): outputs = (outputs,) elif len(outputs) == 2 and isinstance(outputs[1], torch.Tensor) and \ - torch.equal(outputs[1], torch.tensor(0).cuda()): + torch.equal(outputs[1], torch.tensor(0).to(get_accelerator().device_name())): # a hacky solution to overcome issue when running old script examples/pretrain_gpt_distributed.sh outputs = (outputs[0],) torch.autograd.backward(outputs, args) diff --git a/megatron/mpu/tests/commons.py b/megatron/mpu/tests/commons.py index 5e7a18672..432e720a0 100644 --- a/megatron/mpu/tests/commons.py +++ b/megatron/mpu/tests/commons.py @@ -20,7 +20,7 @@ import torch import mpu - +from deepspeed.accelerator import get_accelerator class IdentityLayer(torch.nn.Module): def __init__(self, size, scale=1.0): @@ -56,10 +56,10 @@ def initialize_distributed(backend='nccl'): 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) # Set the device id. - device = rank % torch.cuda.device_count() + device = rank % get_accelerator().device_count() if local_rank is not None: device = local_rank - torch.cuda.set_device(device) + get_accelerator().set_device(device) # Call the init process. 
init_method = 'tcp://' diff --git a/megatron/mpu/tests/test_cross_entropy.py b/megatron/mpu/tests/test_cross_entropy.py index 46d7ba981..8155e3645 100644 --- a/megatron/mpu/tests/test_cross_entropy.py +++ b/megatron/mpu/tests/test_cross_entropy.py @@ -23,6 +23,7 @@ import torch import random import sys +from deepspeed.accelerator import get_accelerator sys.path.append("../..") @@ -30,9 +31,9 @@ def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): set_random_seed(seed) identity = IdentityLayer((batch_size, seq_length, vocab_size), - scale=logits_scale).cuda() + scale=logits_scale).to(get_accelerator().device_name()) logits = identity() - target = torch.cuda.LongTensor( + target = get_accelerator().LongTensor( size=(batch_size, seq_length)).random_(0, vocab_size) loss = F.cross_entropy(logits.view(-1, logits.size()[-1]), target.view(-1), @@ -45,10 +46,10 @@ def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): set_random_seed(seed) identity = IdentityLayer((batch_size, seq_length, vocab_size), - scale=logits_scale).cuda() + scale=logits_scale).to(get_accelerator().device_name()) logits = identity() logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits) - target = torch.cuda.LongTensor( + target = get_accelerator().LongTensor( size=(batch_size, seq_length)).random_(0, vocab_size) loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() loss.backward() diff --git a/megatron/mpu/tests/test_data.py b/megatron/mpu/tests/test_data.py index ae3627703..630b00900 100644 --- a/megatron/mpu/tests/test_data.py +++ b/megatron/mpu/tests/test_data.py @@ -15,6 +15,7 @@ from commons import print_separator from commons import initialize_distributed +from deepspeed.accelerator import get_accelerator from mpu import data as data_utils import mpu import torch @@ -65,7 +66,7 @@ def test_broadcast_data(tensor_model_parallel_size): data_b = data_utils.broadcast_data(keys, data, torch.int64) for key in keys: - tensor = data_t[key].cuda() + tensor = data_t[key].to(get_accelerator().device_name()) assert data_b[key].sub(tensor).abs().max() == 0 # Reset groups diff --git a/megatron/mpu/tests/test_layers.py b/megatron/mpu/tests/test_layers.py index b12f48509..c5e3a2afc 100644 --- a/megatron/mpu/tests/test_layers.py +++ b/megatron/mpu/tests/test_layers.py @@ -25,7 +25,7 @@ import sys sys.path.append("../..") - +device_name = get_accelerator().device_name() def test_parallel_embedding(tensor_model_parallel_size): if torch.distributed.get_rank() == 0: @@ -43,11 +43,11 @@ def test_parallel_embedding(tensor_model_parallel_size): set_random_seed(123) input_data = torch.LongTensor( - size=(batch_size, seq_length)).random_(0, vocab_size).cuda() - loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() + size=(batch_size, seq_length)).random_(0, vocab_size).to(device_name) + loss_weight = torch.randn([batch_size, seq_length, hidden_size]).to(device_name) set_random_seed(seed) - embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() + embedding_original = torch.nn.Embedding(vocab_size, hidden_size).to(device_name) output = embedding_original(input_data) loss_original = torch.mul(output, loss_weight).sum() @@ -55,14 +55,14 @@ def test_parallel_embedding(tensor_model_parallel_size): set_random_seed(seed) embedding_parallel = layers.ParallelEmbedding( - vocab_size, hidden_size, init_method=init.normal_).cuda() + vocab_size, hidden_size, init_method=init.normal_).to(device_name) output = embedding_parallel(input_data) 
loss_parallel = torch.mul(output, loss_weight).sum() loss_parallel.backward() set_random_seed(seed) embedding_vocab_parallel = layers.VocabParallelEmbedding( - vocab_size, hidden_size, init_method=init.normal_).cuda() + vocab_size, hidden_size, init_method=init.normal_).to(device_name) output = embedding_vocab_parallel(input_data) loss_vocab_parallel = torch.mul(output, loss_weight).sum() loss_vocab_parallel.backward() @@ -200,10 +200,10 @@ def test_column_parallel_linear(tensor_model_parallel_size): batch_size = 7 # Network - identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + identity_layer = IdentityLayer2D(batch_size, input_size).to(device_name) linear_layer = mpu.ColumnParallelLinear( - input_size, output_size, keep_master_weight_for_test=True).cuda() - loss_weight = torch.randn([batch_size, output_size]).cuda() + input_size, output_size, keep_master_weight_for_test=True).to(device_name) + loss_weight = torch.randn([batch_size, output_size]).to(device_name) # Forward input_ = identity_layer() output = linear_layer(input_) @@ -214,9 +214,9 @@ def test_column_parallel_linear(tensor_model_parallel_size): # Values. dLdY = loss_weight X = identity_layer.weight - A = linear_layer.master_weight.cuda() + A = linear_layer.master_weight.to(device_name) dLdA = torch.matmul(dLdY.t(), X) - dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdb = torch.matmul(torch.ones(batch_size, 1).to(device_name).t(), dLdY).view(-1) dLdX = torch.matmul(dLdY, A) rank = mpu.get_tensor_model_parallel_rank() @@ -267,10 +267,10 @@ def test_row_parallel_linear(tensor_model_parallel_size): batch_size = 7 # Network - identity_layer = IdentityLayer2D(batch_size, input_size).cuda() + identity_layer = IdentityLayer2D(batch_size, input_size).to(device_name) linear_layer = mpu.RowParallelLinear( - input_size, output_size, keep_master_weight_for_test=True).cuda() - loss_weight = torch.randn([batch_size, output_size]).cuda() + input_size, output_size, keep_master_weight_for_test=True).to(device_name) + loss_weight = torch.randn([batch_size, output_size]).to(device_name) # Forward input_ = identity_layer() output = linear_layer(input_) @@ -281,9 +281,9 @@ def test_row_parallel_linear(tensor_model_parallel_size): # Values. 
dLdY = loss_weight X = identity_layer.weight - A = linear_layer.master_weight.cuda() + A = linear_layer.master_weight.to(device_name) dLdA = torch.matmul(dLdY.t(), X) - dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) + dLdb = torch.matmul(torch.ones(batch_size, 1).to(device_name).t(), dLdY).view(-1) dLdX = torch.matmul(dLdY, A) rank = mpu.get_tensor_model_parallel_rank() @@ -340,11 +340,11 @@ def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partit # Network identity_layer = IdentityLayer3D(batch_size, sequence_length, - hidden_size).cuda() + hidden_size).to(device_name) attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, - dropout_prob).cuda() - loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() - attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + dropout_prob).to(device_name) + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).to(device_name) + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).to(device_name) # Forward input_ = identity_layer() output = attention_layer(input_, attention_mask) @@ -426,13 +426,13 @@ def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition # Network identity_layer = IdentityLayer3D(batch_size, sequence_length, - hidden_size).cuda() + hidden_size).to(device_name) transformer_layer = mpu.BertParallelTransformerLayer( hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, - torch.nn.functional.relu, 1.0e-5).cuda() + torch.nn.functional.relu, 1.0e-5).to(device_name) - loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda() - attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() + loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).to(device_name) + attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).to(device_name) # Forward input_ = identity_layer() output = transformer_layer(input_, attention_mask) diff --git a/megatron/mpu/tests/test_random.py b/megatron/mpu/tests/test_random.py index 9c9c50341..92ec14d2e 100644 --- a/megatron/mpu/tests/test_random.py +++ b/megatron/mpu/tests/test_random.py @@ -32,11 +32,11 @@ def test_set_cuda_rng_state(tensor_model_parallel_size): size = 123 seed = 1234 - torch.cuda.manual_seed(1234) - tensor = torch.cuda.FloatTensor(size) + get_accelerator().manual_seed(1234) + tensor = get_accelerator().FloatTensor(size) # Get the state - rng_state = torch.cuda.get_rng_state() + rng_state = get_accelerator().get_rng_state() rng_state_copy = rng_state.clone() # Do some stuff. @@ -45,10 +45,10 @@ def test_set_cuda_rng_state(tensor_model_parallel_size): result_1 = tensor.clone() assert rng_state.sub(rng_state_copy).max() == 0 - assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 + assert get_accelerator().get_rng_state().sub(rng_state_copy).max() > 0 # State should be different. - new_rng_state = torch.cuda.get_rng_state() + new_rng_state = get_accelerator().get_rng_state() max_diff = new_rng_state.sub(rng_state).max() print(' max diff in rng state (should be non-zero) on global rank {}: {}'. format(torch.distributed.get_rank(), max_diff)) @@ -95,17 +95,17 @@ def test_cuda_rng_tracker(tensor_model_parallel_size): seed_1 = 1234 seed_2 = 4321 size = [12, 21] - tensor = torch.cuda.FloatTensor(size) + tensor = get_accelerator().FloatTensor(size) # Set to seed_1 and generate two tensors. 
- torch.cuda.manual_seed(seed_1) + get_accelerator().manual_seed(seed_1) torch.randn(size, out=tensor) target_11 = tensor.clone() torch.randn(size, out=tensor) target_12 = tensor.clone() # Set to seed_2 and generate two tensors. - torch.cuda.manual_seed(seed_2) + get_accelerator().manual_seed(seed_2) torch.randn(size, out=tensor) target_21 = tensor.clone() torch.randn(size, out=tensor) @@ -113,7 +113,7 @@ def test_cuda_rng_tracker(tensor_model_parallel_size): # Now if we interleave seed_1 and seed_2, # we should still get the same tensors - torch.cuda.manual_seed(seed_1) + get_accelerator().manual_seed(seed_1) mpu.get_cuda_rng_tracker().add('test', seed_2) torch.randn(size, out=tensor) @@ -164,9 +164,9 @@ def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size): tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() mpu.model_parallel_cuda_manual_seed(12345) - assert torch.cuda.initial_seed() == 12345 + assert get_accelerator().initial_seed() == 12345 with mpu.get_cuda_rng_tracker().fork(): - assert torch.cuda.initial_seed() == (12345 + 2718 + + assert get_accelerator().initial_seed() == (12345 + 2718 + mpu.get_tensor_model_parallel_rank()) # Reset the tracker diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 68307423c..659d680be 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -12,9 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from deepspeed.accelerator import get_accelerator +if get_accelerator().device_name() == 'cuda': + from apex.optimizers import FusedAdam as Adam + from apex.optimizers import FusedSGD as SGD +else: + from torch.optim import Adam + from torch.optim import SGD -from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD from megatron import get_args from megatron.model import LayerNorm diff --git a/megatron/optimizer/clip_grads.py b/megatron/optimizer/clip_grads.py index 036a1d4c4..1f26f3868 100644 --- a/megatron/optimizer/clip_grads.py +++ b/megatron/optimizer/clip_grads.py @@ -17,14 +17,17 @@ import torch from torch._six import inf +from deepspeed.accelerator import get_accelerator + +if get_accelerator().device_name() == 'cuda': + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C from megatron import mpu from megatron.model.module import param_is_not_shared from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate - +from deepspeed.accelerator import get_accelerator def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): """Clips gradient norm of an iterable of parameters whose gradients @@ -61,7 +64,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): grad = param.grad.detach() if grad_not_none: # Make sure the grads are in fp32 - assert param.grad.type() == 'torch.cuda.FloatTensor' + assert param.grad.type() == 'torch.{}.FloatTensor'.format(get_accelerator().device_name()) grads.append(grad) if grad_not_none and is_not_shared and is_not_tp_duplicate: grads_for_norm.append(grad) @@ -74,7 +77,7 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Calculate norm. 
if norm_type == inf: total_norm = max(grad.abs().max() for grad in grads_for_norm) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all model-parallel GPUs. torch.distributed.all_reduce(total_norm_cuda, op=torch.distributed.ReduceOp.MAX, @@ -83,20 +86,22 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): else: if norm_type == 2.0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - # Use apex's multi-tensor applier for efficiency reasons. - # Multi-tensor applier takes a function and a list of list - # and performs the operation on that list all in one kernel. - grad_norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - dummy_overflow_buf, - [grads_for_norm], - False # no per-parameter norm - ) + if get_accelerator().device_name() == 'cuda': + dummy_overflow_buf = get_accelerator().IntTensor([0]) + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. + grad_norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [grads_for_norm], + False # no per-parameter norm + ) + else: + grad_norm = torch.norm(grads_for_norm,p=2.0) # Since we will be summing across data parallel groups, # we need the pow(norm-type). total_norm = grad_norm ** norm_type - else: for grad in grads_for_norm: grad_norm = torch.norm(grad, norm_type) @@ -111,11 +116,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2): # Scale. clip_coeff = max_norm / (total_norm + 1.0e-6) if clip_coeff < 1.0: - dummy_overflow_buf = torch.cuda.IntTensor([0]) - multi_tensor_applier(amp_C.multi_tensor_scale, - dummy_overflow_buf, - [grads, grads], - clip_coeff) + if get_accelerator().device_name() == 'cuda': + dummy_overflow_buf = get_accelerator().IntTensor([0]) + multi_tensor_applier(amp_C.multi_tensor_scale, + dummy_overflow_buf, + [grads, grads], + clip_coeff) + else: + for g in grads: + g.detach().mul_(clip_coeff.to(g.device)) return total_norm diff --git a/megatron/optimizer/grad_scaler.py b/megatron/optimizer/grad_scaler.py index 6b25588db..bc2897f86 100644 --- a/megatron/optimizer/grad_scaler.py +++ b/megatron/optimizer/grad_scaler.py @@ -19,14 +19,14 @@ from abc import abstractmethod import torch - +from deepspeed.accelerator import get_accelerator class MegatronGradScaler(ABC): def __init__(self, initial_scale): """Initialize scale value with the input initial scale.""" assert initial_scale > 0.0 - self._scale = torch.cuda.FloatTensor([initial_scale]) + self._scale = get_accelerator().FloatTensor([initial_scale]) @property def scale(self): @@ -75,13 +75,13 @@ def __init__(self, initial_scale, min_scale, # Lower bound on the scale. assert min_scale > 0.0 assert min_scale <= initial_scale - self.min_scale = torch.cuda.FloatTensor([min_scale]) + self.min_scale = get_accelerator().FloatTensor([min_scale]) # Growth and backoff factors for the scale. assert growth_factor > 1.0 - self.growth_factor = torch.cuda.FloatTensor([growth_factor]) + self.growth_factor = get_accelerator().FloatTensor([growth_factor]) assert backoff_factor < 1.0 assert backoff_factor > 0.0 - self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) + self.backoff_factor = get_accelerator().FloatTensor([backoff_factor]) # Interval over which if we don't see any inf/nan, # we will scale the grad scale by the growth factor. 
assert growth_interval > 0 @@ -128,6 +128,6 @@ def state_dict(self): def load_state_dict(self, state_dict): - self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) + self._scale = state_dict['scale'].to(get_accelerator().current_device_name()) self._growth_tracker = state_dict['growth_tracker'] self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/megatron/optimizer/optimizer.py b/megatron/optimizer/optimizer.py index 77baddd62..6568bf10a 100644 --- a/megatron/optimizer/optimizer.py +++ b/megatron/optimizer/optimizer.py @@ -20,13 +20,11 @@ import torch -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C from megatron import get_timers from megatron import mpu from megatron import print_rank_0 - +from deepspeed.accelerator import get_accelerator from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32 @@ -50,7 +48,10 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): We don't have a blfoat16 implementation so for now if the overflow_buf is not provided, we default back to simple loop copy to be compatible with bfloat16.""" - if overflow_buf: + if get_accelerator().device_name() == 'cuda' and overflow_buf: + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C + overflow_buf.fill_(0) # Scaling with factor `1.0` is equivalent to copy. multi_tensor_applier(amp_C.multi_tensor_scale, @@ -204,7 +205,7 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, # Note that we keep this for the cases that grad scaler is none. # We still record nan/inf if we have a bfloat16 with a grad scaler. if self.grad_scaler: - self.found_inf = torch.cuda.FloatTensor([0.0]) + self.found_inf = get_accelerator().FloatTensor([0.0]) # Dummy tensor needed for apex multi-apply tensor. # For bfloat, we don't have multi-tensor apply and for now @@ -212,11 +213,11 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, if bf16: self._dummy_overflow_buf = None else: - self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + self._dummy_overflow_buf = get_accelerator().IntTensor([0]) # In case grad scaler is not passed, define the unity scale. if self.grad_scaler is None: - self._scale_one = torch.cuda.FloatTensor([1.0]) + self._scale_one = get_accelerator().FloatTensor([1.0]) # ====================== # main parameter stuff @@ -240,8 +241,10 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, if param.requires_grad: # float16 params: - if param.type() in ['torch.cuda.HalfTensor', - 'torch.cuda.BFloat16Tensor']: + + + if param.type() in ['torch.{}.HalfTensor'.format(get_accelerator().device_name()), + 'torch.{}.BFloat16Tensor'.format(get_accelerator().device_name())]: float16_params_this_group.append(param) # Create a copy main_param = param.detach().clone().float() @@ -259,16 +262,17 @@ def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad, = self.optimizer.state.pop(param) # fp32 params. - elif param.type() == 'torch.cuda.FloatTensor': + elif param.type() == 'torch.{}.FloatTensor'.format(format(get_accelerator().device_name())): fp32_params_this_group.append(param) param_group['params'][i] = param else: + device_name = get_accelerator().device_name() raise TypeError('Wrapped parameters must be one of ' - 'torch.cuda.FloatTensor, ' - 'torch.cuda.HalfTensor, or ' - 'torch.cuda.BFloat16Tensor. ' - 'Received {}'.format(param.type())) + 'torch.{}.FloatTensor, ' + 'torch.{}.HalfTensor, or ' + 'torch.{}.BFloat16Tensor. 
' + 'Received {}'.format(device_name,device_name,device_name,param.type())) self.float16_groups.append(float16_params_this_group) self.fp32_from_float16_groups.append( @@ -470,7 +474,7 @@ def __init__(self, optimizer, clip_grad, optimizer, clip_grad, log_num_zeros_in_grad, params_have_main_grad) - self._scale = torch.cuda.FloatTensor([1.0]) + self._scale = get_accelerator().FloatTensor([1.0]) def zero_grad(self, set_to_none=True): diff --git a/megatron/p2p_communication.py b/megatron/p2p_communication.py index 863a60b0a..21df8b2b3 100644 --- a/megatron/p2p_communication.py +++ b/megatron/p2p_communication.py @@ -16,7 +16,7 @@ from functools import reduce import operator import torch - +from deepspeed.accelerator import get_accelerator from megatron import get_args from megatron import mpu @@ -59,12 +59,12 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next, if recv_prev: tensor_recv_prev = torch.empty(tensor_chunk_shape, requires_grad=True, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=dtype) if recv_next: tensor_recv_next = torch.empty(tensor_chunk_shape, requires_grad=True, - device=torch.cuda.current_device(), + device=get_accelerator().current_device_name(), dtype=dtype) # Split tensor into smaller chunks if using scatter-gather optimization. @@ -109,7 +109,7 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next, for req in reqs: req.wait() # To protect against race condition when using batch_isend_irecv(). - torch.cuda.synchronize() + get_accelerator().synchronize() # If using scatter-gather optimization, gather smaller chunks. if args.scatter_gather_tensors_in_pipeline: diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index adf04bcb4..95c013e44 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -22,7 +22,6 @@ import torch import torch.nn.functional as F - from megatron import get_args from megatron import get_tokenizer from megatron import mpu @@ -33,14 +32,14 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import Float16Module - +from deepspeed.accelerator import get_accelerator def get_batch(context_tokens): """Generate batch from context tokens.""" args = get_args() tokenizer = get_tokenizer() # Move to GPU. - tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda() + tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().to(get_accelerator().device_name()) # Get the attention mask and postition ids. 
attention_mask, _, position_ids = get_ltor_masks_and_position_ids( tokens, @@ -137,7 +136,7 @@ def generate_samples_input_from_file(model): context_length = 0 input_info = [terminate_runs, raw_text_len, context_length] - input_info_tensor = torch.cuda.LongTensor(input_info) + input_info_tensor = get_accelerator().LongTensor(input_info) torch.distributed.all_reduce(input_info_tensor, group=mpu.get_model_parallel_group()) terminate_runs = input_info_tensor[0].item() @@ -154,14 +153,14 @@ def generate_samples_input_from_file(model): if mpu.is_pipeline_first_stage(): src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_tokens_tensor = get_accelerator().LongTensor(context_tokens) torch.distributed.broadcast(context_tokens_tensor, src, group) else: src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() context_tokens_tensor = torch.empty(context_length, dtype=torch.int64, - device=torch.device("cuda")) + device=get_accelerator().current_device_name()) torch.distributed.broadcast(context_tokens_tensor, src, group) context_tokens = context_tokens_tensor.cpu().numpy().tolist() @@ -259,7 +258,7 @@ def generate_samples_interactive(model, print_frequency=24): context_length = 0 input_info = [terminate_runs, raw_text_len, context_length] - input_info_tensor = torch.cuda.LongTensor(input_info) + input_info_tensor = get_accelerator().LongTensor(input_info) torch.distributed.all_reduce(input_info_tensor, group=mpu.get_model_parallel_group()) terminate_runs = input_info_tensor[0].item() @@ -276,14 +275,14 @@ def generate_samples_interactive(model, print_frequency=24): if mpu.is_pipeline_first_stage(): src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_tokens_tensor = get_accelerator().LongTensor(context_tokens) torch.distributed.broadcast(context_tokens_tensor, src, group) else: src = mpu.get_pipeline_model_parallel_first_rank() group = mpu.get_pipeline_model_parallel_group() context_tokens_tensor = torch.empty(context_length, dtype=torch.int64, - device=torch.device("cuda")) + device=torch.device(get_accelerator().device_name())) torch.distributed.broadcast(context_tokens_tensor, src, group) context_tokens = context_tokens_tensor.cpu().numpy().tolist() @@ -333,12 +332,12 @@ def generate_samples_unconditional(model, latencies=[], model_latencies=[], sing for _ in range(args.micro_batch_size)] ctr = 0 while True: - torch.cuda.synchronize() + get_accelerator().synchronize() start_time = time.time() for token_stream in get_token_stream(model, copy.deepcopy(context_tokens), model_latencies=model_latencies, single_token_latency=single_token_latency): pass - torch.cuda.synchronize() + get_accelerator().synchronize() latencies.append(time.time() - start_time) start_time = time.time() if mpu.is_pipeline_last_stage() and \ @@ -400,8 +399,8 @@ def get_token_stream(model, context_tokens, model_latencies=[], single_token_lat context_tokens, context_lengths = pad_batch(context_tokens, tokenizer.eod, args) - context_tokens_tensor = torch.cuda.LongTensor(context_tokens) - context_length_tensor = torch.cuda.LongTensor(context_lengths) + context_tokens_tensor = get_accelerator().LongTensor(context_tokens) + context_length_tensor = get_accelerator().LongTensor(context_lengths) torch.distributed.broadcast(context_length_tensor, 
mpu.get_tensor_model_parallel_src_rank(), @@ -422,10 +421,10 @@ def get_token_stream(model, context_tokens, model_latencies=[], single_token_lat t0=time.time() for tokens, lengths in batch_token_iterator: if count > 1: - torch.cuda.synchronize() + get_accelerator().synchronize() t_elapsed = time.time() - t0 single_token_latency.append(t_elapsed) - torch.cuda.synchronize() + get_accelerator().synchronize() t0=time.time() count+=1 context_length += 1 @@ -447,7 +446,7 @@ def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids, # Hidden size changes when not using recompute, need to tell p2p_communicate # functions the correct size - torch.cuda.synchronize() + get_accelerator().synchronize() t0 = time.time() args = get_args() orig_seq_length = args.seq_length @@ -476,7 +475,7 @@ def forward_step(model, tokens, position_ids, attention_mask, tokentype_ids, send_forward(output_tensor) args.seq_length = orig_seq_length - torch.cuda.synchronize() + get_accelerator().synchronize() model_latencies.append(time.time()-t0) if get_key_value: return output_tensor, layer_past @@ -506,14 +505,14 @@ def sample_sequence_batch(model, context_tokens, context_lengths, layer_past = None batch_size = context_tokens.size(0) - is_done = torch.zeros([batch_size]).byte().cuda() + is_done = torch.zeros([batch_size]).byte().to(get_accelerator().device_name()) tokens = context_tokens if maxlen is None: maxlen = args.seq_length - 1 if maxlen > (org_context_length + args.out_seq_length): maxlen = org_context_length + args.out_seq_length - lengths = torch.ones([batch_size]).long().cuda() * maxlen + lengths = torch.ones([batch_size]).long().to(get_accelerator().device_name()) * maxlen while context_length <= (maxlen): if args.recompute: @@ -593,7 +592,7 @@ def sample_sequence_batch(model, context_tokens, context_lengths, else: yield None, None - done = torch.cuda.ByteTensor([0]) + done = get_accelerator().ByteTensor([0]) src = mpu.get_pipeline_model_parallel_last_rank() group = mpu.get_pipeline_model_parallel_group() torch.distributed.broadcast(done, src, group) diff --git a/megatron/training.py b/megatron/training.py index 92d4ba2a8..adf84699f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -52,7 +52,7 @@ from megatron.schedules import forward_backward_pipelining_without_interleaving from megatron.schedules import forward_backward_pipelining_with_interleaving from megatron.utils import report_memory, throughput_calculator, checkpoint_throughput_calculator - +from deepspeed.accelerator import get_accelerator import deepspeed from deepspeed.compression.compress import init_compression, redundancy_clean @@ -105,7 +105,7 @@ def pretrain(train_valid_test_dataset_provider, # This will be closer to what scheduler will see (outside of # image ... launches. global _TRAIN_START_TIME - start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME]) + start_time_tensor = get_accelerator().FloatTensor([_TRAIN_START_TIME]) torch.distributed.all_reduce(start_time_tensor, op=torch.distributed.ReduceOp.MIN) _TRAIN_START_TIME = start_time_tensor.item() @@ -326,14 +326,15 @@ def get_model(model_provider_func): # GPU allocation. for model_module in model: - model_module.cuda(torch.cuda.current_device()) + model_module.to(get_accelerator().current_device_name()) + # Fp16 conversion. 
if args.fp16 or args.bf16: model = [Float16Module(model_module, args) for model_module in model] if args.DDP_impl == 'torch': - i = torch.cuda.current_device() + i = get_accelerator().current_device() model = [torchDDP(model_module, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) for model_module in model] @@ -712,7 +713,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration, for key in loss_dict: if not skipped_iter: total_loss_dict[key] = total_loss_dict.get( - key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] + key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] else: value = loss_dict[key].float().sum().item() is_nan = value == float('inf') or \ @@ -848,23 +849,23 @@ def add_to_logging(name): # print('step {} rank {} before sync opt_stats {}, {}'.format(iteration, torch.distributed.get_rank(), opt_stats_2, opt_stats)) if args.zero_stage > 0: # ZeRO partiions optimizer states - opt_stats = torch.cuda.FloatTensor(opt_stats) + opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_data_parallel_group()) - opt_stats_2 = torch.cuda.FloatTensor(opt_stats_2) + opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_data_parallel_group()) if args.tensor_model_parallel_size > 1: - opt_stats = torch.cuda.FloatTensor(opt_stats) + opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_tensor_model_parallel_group()) - opt_stats_2 = torch.cuda.FloatTensor(opt_stats_2) + opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_tensor_model_parallel_group()) if args.pipeline_model_parallel_size > 1: - opt_stats = torch.cuda.FloatTensor(opt_stats) + opt_stats = get_accelerator().FloatTensor(opt_stats) torch.distributed.all_reduce(opt_stats, group=mpu.get_pipeline_model_parallel_group()) - opt_stats_2 = torch.cuda.FloatTensor(opt_stats_2) + opt_stats_2 = get_accelerator().FloatTensor(opt_stats_2) torch.distributed.all_reduce(opt_stats_2, op=torch.distributed.ReduceOp.MAX, group=mpu.get_pipeline_model_parallel_group()) @@ -939,7 +940,7 @@ def add_to_logging(name): float(max(1, total_loss_dict[advanced_iters_key])) if avg > 0.0: log_string += ' {}: {:.6E} |'.format(key, avg) - total_loss_dict[key] = torch.cuda.FloatTensor([0.0]) + total_loss_dict[key] = get_accelerator().FloatTensor([0.0]) log_string += ' loss scale: {:.1f} |'.format(loss_scale) if grad_norm is not None: log_string += ' grad norm: {:.3f} |'.format(grad_norm) @@ -1103,7 +1104,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler, # Exiting based on duration if args.exit_duration_in_mins: train_time = (time.time() - _TRAIN_START_TIME) / 60.0 - done_cuda = torch.cuda.IntTensor( + done_cuda = get_accelerator().IntTensor( [train_time > args.exit_duration_in_mins]) torch.distributed.all_reduce( done_cuda, op=torch.distributed.ReduceOp.MAX) @@ -1180,7 +1181,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): for key in loss_dict: if 'moe' not in key: total_loss_dict[key] = total_loss_dict.get( - key, torch.cuda.FloatTensor([0.0])) + loss_dict[key] + key, get_accelerator().FloatTensor([0.0])) + loss_dict[key] args.consumed_valid_samples += mpu.get_data_parallel_world_size() \ * args.micro_batch_size \ @@ -1299,10 +1300,10 @@ def build_train_valid_test_data_iterators( 
do_valid = valid_dataloader is not None and args.eval_iters > 0 do_test = test_dataloader is not None and args.eval_iters > 0 # Need to broadcast num_tokens and num_type_tokens. - flags = torch.cuda.LongTensor( + flags = get_accelerator().LongTensor( [int(do_train), int(do_valid), int(do_test)]) else: - flags = torch.cuda.LongTensor([0, 0, 0]) + flags = get_accelerator().LongTensor([0, 0, 0]) # Broadcast num tokens. torch.distributed.broadcast(flags, diff --git a/megatron/utils.py b/megatron/utils.py index 99c9438bc..6657a289d 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -20,8 +20,10 @@ import torch from torch.nn.parallel import DistributedDataParallel as torchDDP -from apex.multi_tensor_apply import multi_tensor_applier -import amp_C +from deepspeed.accelerator import get_accelerator +if get_accelerator().device_name() == 'cuda': + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C from megatron import get_args from megatron import print_rank_0 @@ -30,7 +32,7 @@ from megatron.model.module import param_is_not_shared from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate from megatron import get_num_microbatches - +from deepspeed.accelerator import get_accelerator def unwrap_model(model, module_instances=(torchDDP)): return_list = True if not isinstance(model, list): @@ -63,13 +65,18 @@ def calc_params_l2_norm(model): else: params_data.append(param.data) # Calculate norm - dummy_overflow_buf = torch.cuda.IntTensor([0]) - norm, _ = multi_tensor_applier( - amp_C.multi_tensor_l2norm, - dummy_overflow_buf, - [params_data], - False # no per-parameter norm - ) + dummy_overflow_buf = get_accelerator().IntTensor([0]) + + if get_accelerator().device_name() == 'cuda': + + norm, _ = multi_tensor_applier( + amp_C.multi_tensor_l2norm, + dummy_overflow_buf, + [params_data], + False # no per-parameter norm + ) + else: + norm = torch.norm(torch.stack([torch.norm(p.float(), 2.0) for p in params_data]), 2.0) norm_2 = norm * norm # Sum across all model-parallel GPUs. torch.distributed.all_reduce(norm_2, @@ -95,13 +102,13 @@ def report_memory(name): mega_bytes = 1024.0 * 1024.0 string = name + ' memory (MB)' string += ' | allocated: {}'.format( - torch.cuda.memory_allocated() / mega_bytes) + get_accelerator().memory_allocated() / mega_bytes) string += ' | max allocated: {}'.format( - torch.cuda.max_memory_allocated() / mega_bytes) + get_accelerator().max_memory_allocated() / mega_bytes) string += ' | reserved: {}'.format( - torch.cuda.memory_reserved() / mega_bytes) + get_accelerator().memory_reserved() / mega_bytes) string += ' | max reserved: {}'.format( - torch.cuda.max_memory_reserved() / mega_bytes) + get_accelerator().max_memory_reserved() / mega_bytes) if mpu.get_data_parallel_rank() == 0: print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 369152c8d..355c41450 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -31,6 +31,7 @@ import deepspeed from deepspeed.runtime.utils import see_memory_usage +from deepspeed.accelerator.real_accelerator import get_accelerator import os import subprocess @@ -62,7 +63,7 @@ def model_provider(pre_process=True, post_process=True): # pipeline it as an activation during training. The mask is constant, and thus # we can reuse it.
attention_mask = torch.tril(torch.ones( - (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view( + (1, args.seq_length, args.seq_length), device=get_accelerator().current_device_name())).view( 1, 1, args.seq_length, args.seq_length) # Convert attention mask to binary: diff --git a/pretrain_ict.py b/pretrain_ict.py index 1438b3d57..5ed02356a 100644 --- a/pretrain_ict.py +++ b/pretrain_ict.py @@ -29,7 +29,7 @@ from megatron.model.biencoder_model import biencoder_model_provider from megatron.training import pretrain from megatron.utils import average_losses_across_data_parallel_group - +from deepspeed.accelerator import get_accelerator def pretrain_ict_model_provider(): args = get_args() @@ -89,8 +89,8 @@ def forward_step(data_iterator, model, input_tensor): timers('batch-generator').stop() # Query and Context Types - query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0) - context_types = torch.cuda.LongTensor(*context_tokens.shape).fill_(0) + query_types = get_accelerator().LongTensor(*query_tokens.shape).fill_(0) + context_types = get_accelerator().LongTensor(*context_tokens.shape).fill_(0) # Forward model. query_logits, context_logits = model(query_tokens, query_mask, @@ -118,12 +118,12 @@ def forward_step(data_iterator, model, input_tensor): k=softmax_scores.shape[1], sorted=True) def topk_accuracy(k): - return torch.cuda.FloatTensor([sum([int(i in sorted_indices[i, :k]) \ + return get_accelerator().FloatTensor([sum([int(i in sorted_indices[i, :k]) \ for i in range(global_batch_size)]) / global_batch_size]) topk_accs = [topk_accuracy(int(k)) for k in args.retriever_report_topk_accuracies] - labels = torch.arange(global_batch_size).long().cuda() + labels = torch.arange(global_batch_size).long().to(get_accelerator().device_name()) loss = F.nll_loss(softmax_scores, labels, reduction='mean') reduced_losses = average_losses_across_data_parallel_group([loss, *topk_accs]) diff --git a/pretrain_vit.py b/pretrain_vit.py index 16ec10439..f086d18e1 100644 --- a/pretrain_vit.py +++ b/pretrain_vit.py @@ -37,8 +37,8 @@ def get_batch(data_iterator): data = next(data_iterator) # only data parallelism; no need for broadcast - images = data[0].cuda() - labels = data[1].cuda() + images = data[0].to(get_accelerator().device_name()) + labels = data[1].to(get_accelerator().device_name()) return images, labels diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index ca28e9ee3..394261e3f 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -55,7 +55,7 @@ def __init__(self, model, tokenizer): self.cache_hook = CacheHook(None) self.is_main = args.rank == 0 self.is_local_main = args.local_rank == 0 - self._device = torch.cuda.current_device() + self._device = get_accelerator().current_device_name() self.is_model_parallel = mpu.get_tensor_model_parallel_world_size() > 1 self.is_pipe_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 self.is_data_parallel = mpu.get_data_parallel_world_size() > 1 diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index 7549f4a09..6c15732c6 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -27,6 +27,7 @@ from megatron.schedules import get_forward_backward_func from tasks.finetune_utils import build_data_loader from tasks.finetune_utils import process_batch +from deepspeed.accelerator import get_accelerator def accuracy_func_provider(single_dataset_provider): @@ -172,7 +173,7 @@ def correct_answers_forward_step(batch, model): # Reduce. 
if mpu.is_pipeline_last_stage(): - unreduced = torch.cuda.LongTensor([correct, total]) + unreduced = get_accelerator().LongTensor([correct, total]) torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index df9210287..fd4e79a98 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -32,16 +32,16 @@ from megatron.utils import average_losses_across_data_parallel_group from megatron.utils import calc_params_l2_norm from megatron.utils import check_adlr_autoresume_termination - +from deepspeed.accelerator import get_accelerator def process_batch(batch): """Process batch and produce inputs for the model.""" args = get_args() - tokens = batch['text'].long().cuda().contiguous() - types = batch['types'].long().cuda().contiguous() - labels = batch['label'].long().cuda().contiguous() - attention_mask = batch['padding_mask'].float().cuda().contiguous() + tokens = batch['text'].long().to(get_accelerator().device_name()).contiguous() + types = batch['types'].long().to(get_accelerator().device_name()).contiguous() + labels = batch['label'].long().to(get_accelerator().device_name()).contiguous() + attention_mask = batch['padding_mask'].float().to(get_accelerator().device_name()).contiguous() if args.fp16: attention_mask = attention_mask.half() diff --git a/tasks/orqa/evaluate_utils.py b/tasks/orqa/evaluate_utils.py index ebee03522..986ae1b93 100644 --- a/tasks/orqa/evaluate_utils.py +++ b/tasks/orqa/evaluate_utils.py @@ -25,7 +25,7 @@ from megatron.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex from megatron.model.biencoder_model import biencoder_model_provider from megatron.training import get_model - +from deepspeed.accelerator import get_accelerator class ORQAEvaluator(object): def __init__(self): args = get_args() @@ -121,7 +121,7 @@ def evaluate(self, qa_data, split): split) local_rank = args.local_rank rank = torch.distributed.get_rank() - device_count = torch.cuda.device_count() + device_count = get_accelerator().device_count() num_nodes = torch.distributed.get_world_size() // device_count node_id = rank // device_count @@ -145,14 +145,14 @@ def evaluate(self, qa_data, split): distance, topkindex = self.mips_index.search_mips_index( all_query_tensor, top_k=args.faiss_topk_retrievals, reconstruct=False) - distance = torch.from_numpy(distance).cuda() - topkindex = torch.LongTensor(topkindex).cuda() + distance = torch.from_numpy(distance).to(get_accelerator().device_name()) + topkindex = torch.LongTensor(topkindex).to(get_accelerator().device_name()) if local_rank != 0: distance = torch.empty(device_count * len(query_tensor), \ - args.faiss_topk_retrievals, dtype=torch.float32).cuda() + args.faiss_topk_retrievals, dtype=torch.float32).to(get_accelerator().device_name()) topkindex = torch.empty(device_count * len(query_tensor), \ - args.faiss_topk_retrievals, dtype=torch.int64).cuda() + args.faiss_topk_retrievals, dtype=torch.int64).to(get_accelerator().device_name()) torch.distributed.broadcast(distance, src=device_start_rank, \ group=group) diff --git a/tasks/orqa/natural_questions/nq.py b/tasks/orqa/natural_questions/nq.py index ca07fe416..5282887ee 100644 --- a/tasks/orqa/natural_questions/nq.py +++ b/tasks/orqa/natural_questions/nq.py @@ -28,6 +28,7 @@ from megatron import print_rank_0, get_args, get_tokenizer, mpu from megatron.data.biencoder_dataset_utils import make_attention_mask +from deepspeed.accelerator import get_accelerator def get_nq_dataset(qa_data, split): args = 
get_args() @@ -42,10 +43,10 @@ def get_nq_dataset(qa_data, split): def process_nq_batch(batch): - query_tokens = batch['token_ids'].long().cuda() - query_mask = (batch['token_mask'] < 0.5).cuda() - query_types = batch['token_types'].long().cuda() - query_len = batch['seq_len'].long().cuda() + query_tokens = batch['token_ids'].long().to(get_accelerator().device_name()) + query_mask = (batch['token_mask'] < 0.5).to(get_accelerator().device_name()) + query_types = batch['token_types'].long().to(get_accelerator().device_name()) + query_len = batch['seq_len'].long().to(get_accelerator().device_name()) reference = batch['reference'] return query_tokens, query_mask, query_types, query_len, reference diff --git a/tasks/vision/eval_utils.py b/tasks/vision/eval_utils.py index aabc04a15..37024a709 100644 --- a/tasks/vision/eval_utils.py +++ b/tasks/vision/eval_utils.py @@ -23,7 +23,7 @@ from tasks.vision.finetune_utils import build_data_loader from tasks.vision.finetune_utils import process_batch from torchvision import datasets, transforms - +from deepspeed.accelerator import get_accelerator def accuracy_func_provider(): """Provide function that calculates accuracies.""" @@ -86,7 +86,7 @@ def calculate_correct_answers(model, dataloader, epoch): model.train() # Reduce. - unreduced = torch.cuda.LongTensor([correct, total]) + unreduced = get_accelerator().LongTensor([correct, total]) torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) # Print on screen. diff --git a/tasks/vision/finetune_utils.py b/tasks/vision/finetune_utils.py index afde4aa89..47744ef1d 100644 --- a/tasks/vision/finetune_utils.py +++ b/tasks/vision/finetune_utils.py @@ -29,12 +29,12 @@ from megatron.training import training_log from megatron.utils import check_adlr_autoresume_termination from megatron.utils import average_losses_across_data_parallel_group - +from deepspeed.accelerator import get_accelerator def process_batch(batch): """Process batch and produce inputs for the model.""" - images = batch[0].cuda().contiguous() - labels = batch[1].cuda().contiguous() + images = batch[0].to(get_accelerator().device_name()).contiguous() + labels = batch[1].to(get_accelerator().device_name()).contiguous() return images, labels diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py index 6366cfb60..4e1e763ce 100644 --- a/tasks/zeroshot_gpt/evaluate.py +++ b/tasks/zeroshot_gpt/evaluate.py @@ -29,7 +29,7 @@ from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward from tasks.finetune_utils import build_data_loader - +from deepspeed.accelerator import get_accelerator from .datasets import build_dataset # These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? 
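The change pattern across these files is mechanical: every hard-coded torch.cuda call or .cuda() placement is routed through DeepSpeed's accelerator abstraction so the same code can run on non-CUDA devices. A toy illustration of the pattern (not code from this patch):

import torch
from deepspeed.accelerator import get_accelerator

device = get_accelerator().device_name()          # e.g. 'cuda' on NVIDIA GPUs
tokens = torch.randint(0, 100, (4, 16)).long().to(device).contiguous()
flags = get_accelerator().LongTensor([1, 0, 1])   # replaces torch.cuda.LongTensor
allocated = get_accelerator().memory_allocated()  # replaces torch.cuda.memory_allocated()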
@@ -66,8 +66,8 @@ def process_batch(batch): args = get_args() tokenizer = get_tokenizer() - loss_mask = batch['pad_mask'].long().cuda().contiguous().byte() - tokens_ = batch['text'].long().cuda().contiguous() + loss_mask = batch['pad_mask'].long().to(get_accelerator().device_name()).contiguous().byte() + tokens_ = batch['text'].long().to(get_accelerator().device_name()).contiguous() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() From 57e64391037e7ffd9005a87ae7f1d3009bfddaa7 Mon Sep 17 00:00:00 2001 From: Conglong Li Date: Tue, 21 Feb 2023 10:38:09 -0800 Subject: [PATCH 11/11] data efficiency example update (#113) --- examples/data_efficiency/README.md | 2 +- .../ds_config_bert_TEMPLATE.json | 24 +++ .../finetune_glue/ds_finetune_bert_glue.sh | 156 ++++++++++++++ .../ds_finetune_bert_glue_run.sh | 44 ++++ .../ds_finetune_gather_result.py | 118 +++++++++++ .../pretrain/ds_pretrain_bert_336M_run.sh | 190 ++++++++++++++---- tasks/eval_harness/evaluate.py | 3 +- tasks/eval_utils.py | 110 +++++++--- tasks/finetune_utils.py | 42 ++++ tasks/glue/cola.py | 90 +++++++++ tasks/glue/finetune.py | 59 +++++- tasks/glue/mrpc.py | 101 ++++++++++ tasks/glue/qnli.py | 101 ++++++++++ tasks/glue/rte.py | 101 ++++++++++ tasks/glue/sst2.py | 95 +++++++++ tasks/glue/stsb.py | 100 +++++++++ tasks/main.py | 2 +- 17 files changed, 1275 insertions(+), 63 deletions(-) create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh create mode 100644 examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py create mode 100644 tasks/glue/cola.py create mode 100644 tasks/glue/mrpc.py create mode 100644 tasks/glue/qnli.py create mode 100644 tasks/glue/rte.py create mode 100644 tasks/glue/sst2.py create mode 100644 tasks/glue/stsb.py diff --git a/examples/data_efficiency/README.md b/examples/data_efficiency/README.md index e10db17d5..7ed96ae72 100644 --- a/examples/data_efficiency/README.md +++ b/examples/data_efficiency/README.md @@ -20,4 +20,4 @@ The ``ds_analyze_bert_data_map.sh`` and ``ds_analyze_bert_data_reduce.sh`` are u ``bert/pretrain`` includes the pretraining example scripts. You can choose a setup to run by uncommenting one block in ``ds_pretrain_bert_336M_run.sh``. One thing to note is that in our [random-LTD paper](https://arxiv.org/abs/2211.11586) we did not scale peak learning rate when using less than 100% data, while in our later [data efficiency paper](https://arxiv.org/abs/2212.03597) we find that scaling LR based on used percentage of data helps improve model quality. -``bert/finetune`` includes the finetuning example scripts. \ No newline at end of file +``bert/finetune`` includes the MNLI/QQP/RACE finetuning example scripts following the [Megatron-LM paper](https://arxiv.org/abs/1909.08053). However, we found that the RACE task's accuracy is not very stable and the Megatron-LM paper used a very long number of epochs for MNLI/QQP which is not necessary. Thus we added capability of finetuning other GLUE tasks, and switched to follow the hyperparameters of the [original BERT paper](https://arxiv.org/abs/1810.04805). The corresponding scripts are at ``bert/finetune_glue``, which we recommend to use instead of ``bert/finetune``. 
Our [data efficiency paper](https://arxiv.org/abs/2212.03597) also uses the scripts under ``bert/finetune_glue`` for GLUE finetuning. \ No newline at end of file diff --git a/examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json b/examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json new file mode 100644 index 000000000..2700805d1 --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_config_bert_TEMPLATE.json @@ -0,0 +1,24 @@ +{ + "train_batch_size" : CONFIG_BATCH_SIZE, + "train_micro_batch_size_per_gpu": CONFIG_MBSIZE, + "steps_per_print": LOG_INTERVAL, + + "zero_optimization": { + "stage": ZERO_STAGE, + "elastic_checkpoint": true + }, + + "gradient_clipping": 1.0, + "prescale_gradients": PRESCALE_GRAD, + + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 11 + }, + + "wall_clock_breakdown" : false +} diff --git a/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh new file mode 100644 index 000000000..0e0c571a4 --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue.sh @@ -0,0 +1,156 @@ +hostname_and_rank=$1 +master_port=$2 +seed=$3 +task=$4 +lr=$5 +pretrained_checkpoint=$6 + +# hostname_and_rank="worker-0:0,1,2,3" +# master_port=12345 +# seed=1234 +# task="MNLI" +# lr=2e-5 +# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +############################################################################### +### Main configs +seq_len=512 + +global_batch_size=32 +epochs=3 + +train_data="/blob/data/GlueData/${task}/train.tsv" +valid_data="/blob/data/GlueData/${task}/dev.tsv" +if [[ "${task}" = "MNLI" ]]; then +valid_data="/blob/data/GlueData/MNLI/dev_matched.tsv \ + /blob/data/GlueData/MNLI/dev_mismatched.tsv" +fi + +## Adjust based on number of GPUs. +batch_size=8 + +## BERT 110M (BERT-Base) +# model_size=0.11 +# num_layers=12 +# hidden_size=768 +# num_attn_heads=12 + +## BERT 336M (BERT-Large) +model_size=0.336 +num_layers=24 +hidden_size=1024 +num_attn_heads=16 + +## BERT 1.3B +# model_size=1.3 +# num_layers=24 +# hidden_size=2048 +# num_attn_heads=32 + +## BERT 3.9B +# model_size=3.9 +# num_layers=48 +# hidden_size=2560 +# num_attn_heads=40 +############################################################################### +### Parallelism configs +## Model parallelism, 1 is no MP +mp_size=1 + +## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true. +## Currently pipeline parallelism is not supported for BERT model: DeepSpeed's +## pipeline parallelism is only integrated with the GPT case, and currently +## DeepSpeed is not integrated with Megatron's own pipeline parallelism. +pp_size=1 +no_pp="true" + +## ZeRO stage +zero_stage=0 +############################################################################### +### Misc configs +log_interval=10 +eval_iters=50 +eval_interval=100 + +## Activation checkpointing saves GPU memory, but reduces training speed +# activation_checkpoint="true" +activation_checkpoint="false" +############################################################################### +vocab_file="bert-large-uncased-vocab.txt" +if [ ! 
-f "$vocab_file" ]; then + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt +fi + +jobname="${task}-bsz${global_batch_size}-lr${lr}-epochs${epochs}-seed${seed}" +# output_path="${pretrained_checkpoint}-finetune-glue-4v100/${jobname}" +output_path=$(basename "$pretrained_checkpoint") +output_path="glue-results/${output_path}-finetune-glue-4v100/${jobname}" +mkdir -p ${output_path} + +template_json="ds_config_bert_TEMPLATE.json" +config_json="ds_config_bert_bsz${global_batch_size}_mbsz${batch_size}_log${log_interval}_zero${zero_stage}.json" +if [[ $zero_stage -gt 0 ]]; then +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/false/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +else +sed "s/CONFIG_BATCH_SIZE/${global_batch_size}/" ${template_json} \ + | sed "s/CONFIG_MBSIZE/${batch_size}/" \ + | sed "s/LOG_INTERVAL/${log_interval}/" \ + | sed "s/ZERO_STAGE/${zero_stage}/" \ + | sed "s/PRESCALE_GRAD/true/" \ + | sed "s/CONFIG_FP16_ENABLED/true/" \ + | sed "s/CONFIG_BF16_ENABLED/false/" \ + > ${config_json} +fi + +options=" \ + --finetune \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${zero_stage} \ + --task ${task} \ + --seed ${seed} \ + --train-data ${train_data} \ + --valid-data ${valid_data} \ + --tokenizer-type BertWordPieceLowerCase \ + --vocab-file ${vocab_file} \ + --epochs ${epochs} \ + --pretrained-checkpoint ${pretrained_checkpoint} \ + --tensor-model-parallel-size ${mp_size} \ + --pipeline-model-parallel-size ${pp_size} \ + --num-layers ${num_layers} \ + --hidden-size ${hidden_size} \ + --num-attention-heads ${num_attn_heads} \ + --global-batch-size ${global_batch_size} \ + --micro-batch-size ${batch_size} \ + --lr ${lr} \ + --lr-decay-style linear \ + --lr-warmup-fraction 0.1 \ + --seq-length ${seq_len} \ + --max-position-embeddings ${seq_len} \ + --log-interval ${log_interval} \ + --eval-interval ${eval_interval} \ + --eval-iters ${eval_iters} \ + --weight-decay 1.0e-1 \ + --fp16" + +if [ "${activation_checkpoint}" = "true" ]; then +options="${options} \ + --checkpoint-activations \ + --deepspeed-activation-checkpointing" +fi + +if [[ "${no_pp}" = "true" ]]; then +options="${options} \ + --no-pipeline-parallel" +fi + +# After the fine-tuning finishes, you can find the dev set accuracy numbers by +# "grep -e "overall:" -e "metrics for" ${output_path}/output.log" +deepspeed --include=${hostname_and_rank} --master_port=${master_port} ../../../../tasks/main.py ${options} &> ${output_path}/output.log diff --git a/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh new file mode 100644 index 000000000..10e04f2c7 --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_finetune_bert_glue_run.sh @@ -0,0 +1,44 @@ +hostname_and_rank=$1 +master_port=$2 +pretrained_checkpoint=$3 + +# hostname_and_rank="worker-0:0,1,2,3" +# master_port=12345 +# pretrained_checkpoint="/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp" + +tasks=( + RTE + MRPC + STS-B + CoLA + SST-2 + QNLI + QQP + MNLI +) + +seeds=( + 1234 + 1235 + 1236 + 1237 + 1238 +) + +lrs=( + 2e-5 + 3e-5 + 4e-5 + 5e-5 +) + +for 
((i=0;i<${#tasks[@]};++i)); do + task=${tasks[i]} + for ((j=0;j<${#seeds[@]};++j)); do + seed=${seeds[j]} + for ((k=0;k<${#lrs[@]};++k)); do + lr=${lrs[k]} + bash ds_finetune_bert_glue.sh ${hostname_and_rank} ${master_port} ${seed} ${task} ${lr} ${pretrained_checkpoint} + done + done +done \ No newline at end of file diff --git a/examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py b/examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py new file mode 100644 index 000000000..b359ecb6f --- /dev/null +++ b/examples/data_efficiency/bert/finetune_glue/ds_finetune_gather_result.py @@ -0,0 +1,118 @@ +import os +import statistics + +def gather_numbers(fname, match_keywords, index_keywords, index_offsets): + results = {} + for k in index_keywords: + results[k] = [] + file1 = open(fname, 'r') + while True: + line = file1.readline() + if not line: + break + splits = line.split(' ') + for i in range(len(match_keywords)): + if match_keywords[i] in line: + ref_idx = splits.index(index_keywords[i]) + results[index_keywords[i]].append(float(splits[ref_idx+index_offsets[i]])) + file1.close() + return results + +def gather_GLUE_results(result_path, key, lr): + result = [] + mnli_matched_result = [] + mnli_mismatched_result = [] + for file in os.listdir(result_path): + if file.startswith(key) and lr in file: + fname = f'{result_path}/{file}/output.log' + if os.path.exists(fname): + if key == "STS-B": + results = gather_numbers(fname, ['metrics for'], ['spearmanr'], [2]) + overall_candidate = results['spearmanr'] + overall_candidate = [x * 100.0 for x in overall_candidate] + elif key == "CoLA": + results = gather_numbers(fname, ['metrics for'], ['mcc'], [2]) + overall_candidate = results['mcc'] + overall_candidate = [x * 100.0 for x in overall_candidate] + elif key == "MNLI": + results = gather_numbers(fname, + ['overall:', 'metrics for dev-matched:', 'metrics for dev-mismatched:'], + ['overall:', 'dev-matched:', 'dev-mismatched:'], + [9, 9, 9]) + overall_candidate = results['overall:'] + matched_candidate = results['dev-matched:'] + mismatched_candidate = results['dev-mismatched:'] + else: + results = gather_numbers(fname, ['overall:'], ['overall:'], [9]) + overall_candidate = results['overall:'] + if len(overall_candidate) > 0: + if len(overall_candidate) != 3: + print(f"{result_path} task {key} lr {lr} only has {len(overall_candidate)} epoch") + best_index = overall_candidate.index(max(overall_candidate)) + result.append(overall_candidate[best_index]) + if key == "MNLI": + mnli_matched_result.append(matched_candidate[best_index]) + mnli_mismatched_result.append(mismatched_candidate[best_index]) + if len(result) > 0: + if len(result) != 5: + print(f"{result_path} task {key} lr {lr} only has {len(result)} seed") + if key == "MNLI": + best_index = result.index(statistics.median_high(result)) + return round(mnli_matched_result[best_index],2), round(statistics.stdev(mnli_matched_result),2), round(mnli_mismatched_result[best_index],2), round(statistics.stdev(mnli_mismatched_result),2) + else: + return round(statistics.median_high(result),2), round(statistics.stdev(result),2) + else: + if key == "MNLI": + return None, None, None, None + else: + return None, None + +def gather_finetune_results(result_path, extra_col=[], lr="2e-5"): + output = "" + for field in extra_col: + output += f"{field} &" + task_output = "" + median_list, std_list = [], [] + m_median, m_std, mm_median, mm_std = gather_GLUE_results(result_path, "MNLI", lr) + if m_median is not None: + median_list += 
[m_median, mm_median] + std_list += [m_std, mm_std] + task_output += f"{m_median}±{m_std} & {mm_median}±{mm_std} &" + tasks = ["QQP", "QNLI", "SST-2", "CoLA", "STS-B", "MRPC", "RTE"] + for task in tasks: + t_median, t_std = gather_GLUE_results(result_path, task, lr) + if t_median is not None: + median_list += [t_median] + std_list += [t_std] + if task == "RTE": + task_output += f"{t_median}±{t_std} " + else: + task_output += f"{t_median}±{t_std} &" + overall_median = round(sum(median_list) / len(median_list), 2) + overall_std = round(sum(std_list) / len(std_list), 2) + output += f"{overall_median}±{overall_std} &" + output += task_output + output += " \\\\" + print(output) + +if __name__ == '__main__': + print("\\begin{table}") + print("\centering") + print("\\tiny") + text = "\\begin{tabular}{@{}l|" + for _ in range(11): + text += "c" + text += "@{}}" + print(text) + print("\\toprule") + print("Case & Train tokens & Average & MNLI-m & MNLI-mm & QQP & QNLI & SST-2 & CoLA & STS-B & MRPC & RTE \\\\") + print("\midrule") + + result_path='/blob/users/conglli/project/bert_with_pile/checkpoint/bert-pile-0.336B-iters-2M-lr-1e-4-min-1e-5-wmup-10000-dcy-2M-sty-linear-gbs-1024-mbs-16-gpu-64-zero-0-mp-1-pp-1-nopp-finetune/' + gather_finetune_results(result_path) + + print("\\bottomrule") + print("\end{tabular}") + print("\end{table}") + print("") + print("") \ No newline at end of file diff --git a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh index f03c65ccf..c771a0e27 100644 --- a/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh +++ b/examples/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh @@ -8,26 +8,35 @@ # train_iters_in_million=2 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} ############################################################################### +## Baseline 703B tokens (67%): +# lr=1.5e-4 +# train_iters_in_million=134e-2 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} +############################################################################### +## Baseline 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} +############################################################################### ### Curriculum learning (CL) + Random layerwise token dropping (random-LTD). -### Due to resource constraints, we did not finish training any model with this -### setup. This example is just to demonstrate that CL+random-LTD can run for +### DeepSpeed Data Efficiency's composed solution. ### BERT pretraining. 
## CL+random-LTD 1049B tokens (100%): # lr=1e-4 # train_iters_in_million=2 # ltd_enabled="true" -# ltd_start=200 -# ltd_step_in_million=18e-1 -# dropout=0 +# ltd_start=128 +# ltd_step_in_million=2 +# dropout=1e-1 # cl_enabled="true" # cl_num_metric=2 # cl_1st_metric="voc" -# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" -# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 +# cl_1st_min=5 +# cl_1st_max=100 # cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 @@ -50,46 +59,159 @@ # ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ # ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ # ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ -# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} +############################################################################### +## CL+random-LTD 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step_in_million=1 +# dropout=1e-1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=48e-2 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=48e-2 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} ############################################################################### ### Random layerwise token dropping (random-LTD). 
-## random-LTD 723B tokens (69%): -# lr=1.45e-4 -# train_iters_in_million=138e-2 +## random-LTD 1049B tokens (100%): +# lr=1e-4 +# train_iters_in_million=2 # ltd_enabled="true" -# ltd_start=200 -# ltd_step_in_million=18e-1 -# dropout=0 +# ltd_start=128 +# ltd_step_in_million=2 +# dropout=1e-1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} +############################################################################### +## random-LTD 703B tokens (67%): +# lr=1.5e-4 +# train_iters_in_million=134e-2 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step_in_million=134e-2 +# dropout=1e-1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} +############################################################################### +## random-LTD 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 +# ltd_enabled="true" +# ltd_start=128 +# ltd_step_in_million=1 +# dropout=1e-1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ # ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} ############################################################################### ### Curriculum learning (CL). -## CL vocab rarity 734B tokens (70%): -# lr=1.4e-4 -# train_iters_in_million=14e-1 +## CL vocab rarity + seqlen truncation 524B tokens (50%): +# lr=2e-4 +# train_iters_in_million=1 # ltd_enabled="false" # ltd_start=512 # ltd_step_in_million=1 # dropout=1e-1 # cl_enabled="true" -# cl_num_metric=1 +# cl_num_metric=2 # cl_1st_metric="voc" -# cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" -# cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 -# cl_1st_total_step_in_million=7e-1 +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=48e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=48e-2 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 # bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ # ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ # ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ # ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ # ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ # ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ -# ${cl_1st_root} +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} 
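# Sanity check on the token budgets in the block labels above, assuming the
# base script (ds_pretrain_bert_336M_base_script.sh, not part of this patch)
# uses global batch size 1024 and sequence length 512:
#   2.0M iters  x 1024 x 512 ~= 1049B tokens (100%)
#   1.34M iters x 1024 x 512 ~=  703B tokens (67%)
#   1.0M iters  x 1024 x 512 ~=  524B tokens (50%)
# which is why each reduced token budget is paired with an inversely scaled
# peak learning rate (1e-4, 1.5e-4, 2e-4).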
+############################################################################### +## CL vocab rarity + seqlen truncation 703B tokens (67%): +# lr=1.5e-4 +# train_iters_in_million=134e-2 +# ltd_enabled="false" +# ltd_start=512 +# ltd_step_in_million=1 +# dropout=1e-1 +# cl_enabled="true" +# cl_num_metric=2 +# cl_1st_metric="voc" +# cl_1st_index_to_sample_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample_percentile_merged" +# cl_1st_index_to_metric_path="/vc_data/users/conglli/code/data_efficiency/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" +# cl_1st_difficulty_type="percentile" +# cl_1st_clustering_type="schedule_based" +# cl_1st_min=5 +# cl_1st_max=100 +# cl_1st_total_step_in_million=64e-2 +# cl_1st_difficulty_step=1 +# cl_1st_root=2 +# cl_2nd_metric="seqlen_truncate" +# cl_2nd_index_to_sample_path="dummy" +# cl_2nd_index_to_metric_path="dummy" +# cl_2nd_difficulty_type="value" +# cl_2nd_clustering_type="single_cluster" +# cl_2nd_min=128 +# cl_2nd_max=512 +# cl_2nd_total_step_in_million=64e-2 +# cl_2nd_difficulty_step=8 +# cl_2nd_root=1 +# bash ds_pretrain_bert_336M_base_script.sh ${lr} ${train_iters_in_million} \ +# ${ltd_enabled} ${ltd_start} ${ltd_step_in_million} ${dropout} \ +# ${cl_enabled} ${cl_num_metric} ${cl_1st_metric} \ +# ${cl_1st_index_to_sample_path} ${cl_1st_index_to_metric_path} \ +# ${cl_1st_difficulty_type} ${cl_1st_clustering_type} ${cl_1st_min} \ +# ${cl_1st_max} ${cl_1st_total_step_in_million} ${cl_1st_difficulty_step} \ +# ${cl_1st_root} ${cl_2nd_metric} ${cl_2nd_index_to_sample_path} \ +# ${cl_2nd_index_to_metric_path} ${cl_2nd_difficulty_type} \ +# ${cl_2nd_clustering_type} ${cl_2nd_min} ${cl_2nd_max} \ +# ${cl_2nd_total_step_in_million} ${cl_2nd_difficulty_step} ${cl_2nd_root} ############################################################################### ## CL vocab rarity + seqlen truncation 1049B tokens (100%): # lr=1e-4 @@ -103,10 +225,10 @@ # cl_1st_metric="voc" # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" # cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 +# cl_1st_min=5 +# cl_1st_max=100 # cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 @@ -170,10 +292,10 @@ # cl_1st_metric="voc" # cl_1st_index_to_sample_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_sample" # cl_1st_index_to_metric_path="/blob/users/conglli/data/analysis_pile_bert_5epoch/vocab_rarity/vocab_rarity_index_to_metric" -# cl_1st_difficulty_type="value" +# cl_1st_difficulty_type="percentile" # cl_1st_clustering_type="schedule_based" -# cl_1st_min=600 -# cl_1st_max=9069 +# cl_1st_min=5 +# cl_1st_max=100 # cl_1st_total_step_in_million=96e-2 # cl_1st_difficulty_step=1 # cl_1st_root=2 diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 394261e3f..7b692b169 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -36,6 +36,7 @@ from megatron.model.distributed import DistributedDataParallel as LocalDDP from megatron.model.module import Float16Module from deepspeed.runtime.pipe import schedule +from deepspeed.accelerator import get_accelerator class 
EvalHarnessAdaptor(GPT2LM): def __init__(self, model, tokenizer): @@ -330,7 +331,7 @@ def load_ds_checkpoint_and_setup_megatron(extra_args_provider): cp_args = ds_checkpoint.get_args() # Merge the current args with the checkpoint args. skip_keys = ['world_size', 'rank', 'local_rank','device_count', 'micro_batch_size','global_batch_size', 'batch_size', 'tensorboard_dir', 'deepspeed', 'deepspeed_config', - 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'moe_expert_parallel_size', 'moe_token_dropping', 'load', 'rampup_batch_size', 'iteration', 'inference'] + 'data_parallel_size', 'pipeline_model_parallel_size', 'tensor_model_parallel_size', 'moe_expert_parallel_size', 'moe_token_dropping', 'load', 'rampup_batch_size', 'iteration', 'inference', 'random_ltd'] skip_if_specified = ['merge_file', 'vocab_file'] diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index 6c15732c6..da7653929 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -64,7 +64,9 @@ def metrics_func(model, epoch, output_predictions=False): correct += correct_ans total += total_count if is_last_rank(): - percent = float(correct) * 100.0 / float(total) + percent = 0 + if total > 0: + percent = float(correct) * 100.0 / float(total) print(' >> |epoch: {}| overall: correct / total = {} / {} = ' '{:.4f} %'.format(epoch, correct, total, percent)) @@ -102,6 +104,7 @@ def calculate_correct_answers(name, model, dataloader, num_micro_batches = args.orig_global_batch_size // micro_batch_size_times_data_parallel def loss_func(output_predictions, labels, output_tensor): + args = get_args() logits = output_tensor loss_dict = {} @@ -113,11 +116,20 @@ def loss_func(output_predictions, labels, output_tensor): loss_dict['labels'] = labels.data.cpu().numpy().tolist() loss_dict['ids'] = batch['uid'].cpu().numpy().tolist() # Compute the correct answers. - predicted = torch.argmax(logits, dim=-1) - corrects = (predicted == labels) - # Add to the counters. - loss_dict['total'] = labels.size(0) - loss_dict['correct'] = corrects.sum().item() + if args.finetune and args.task == 'CoLA': + predicted = torch.argmax(logits, dim=-1) + loss_dict['labels'] = labels.data.cpu().numpy().tolist() + loss_dict['predicted'] = predicted.data.cpu().numpy().tolist() + elif args.finetune and args.task == 'STS-B': + predicted = torch.squeeze(logits) + loss_dict['labels'] = labels.data.cpu().numpy().tolist() + loss_dict['predicted'] = predicted.data.cpu().numpy().tolist() + else: + predicted = torch.argmax(logits, dim=-1) + corrects = (predicted == labels) + # Add to the counters. + loss_dict['total'] = labels.size(0) + loss_dict['correct'] = corrects.sum().item() return 0, loss_dict @@ -139,6 +151,8 @@ def correct_answers_forward_step(batch, model): # For all the batches in the dataset. total = 0 correct = 0 + labels = [] + predicted = [] if output_predictions: # This option is only possible when data parallel size is 1. assert mpu.get_data_parallel_world_size() == 1 @@ -162,8 +176,12 @@ def correct_answers_forward_step(batch, model): softmaxes.extend(loss_dict['softmaxes']) labels.extend(loss_dict['labels']) ids.extend(loss_dict['ids']) - total += loss_dict['total'] - correct += loss_dict['correct'] + if args.finetune and args.task in ['CoLA', 'STS-B']: + labels.extend(loss_dict['labels']) + predicted.extend(loss_dict['predicted']) + else: + total += loss_dict['total'] + correct += loss_dict['correct'] for m in model: @@ -173,24 +191,70 @@ def correct_answers_forward_step(batch, model): # Reduce. 
if mpu.is_pipeline_last_stage(): - unreduced = get_accelerator().LongTensor([correct, total]) - torch.distributed.all_reduce(unreduced, - group=mpu.get_data_parallel_group()) + if args.finetune and args.task in ['CoLA', 'STS-B']: + if args.task == 'CoLA': + labels = get_accelerator().LongTensor(labels) + predicted = get_accelerator().LongTensor(predicted) + labels_gather = [torch.zeros(len(labels), dtype=torch.long, + device=labels.device) for _ in range(mpu.get_data_parallel_world_size())] + predicted_gather = [torch.zeros(len(predicted), dtype=torch.long, + device=predicted.device) for _ in range(mpu.get_data_parallel_world_size())] + else: + labels = get_accelerator().FloatTensor(labels) + predicted = get_accelerator().FloatTensor(predicted) + labels_gather = [torch.zeros(len(labels), dtype=torch.float, + device=labels.device) for _ in range(mpu.get_data_parallel_world_size())] + predicted_gather = [torch.zeros(len(predicted), dtype=torch.float, + device=predicted.device) for _ in range(mpu.get_data_parallel_world_size())] + torch.distributed.all_gather(labels_gather, labels, + group=mpu.get_data_parallel_group()) + torch.distributed.all_gather(predicted_gather, predicted, + group=mpu.get_data_parallel_group()) - # Print on screen. + labels_gather = sum([x.data.cpu().numpy().tolist() for x in labels_gather], []) + predicted_gather = sum([x.data.cpu().numpy().tolist() for x in predicted_gather], []) - correct_ans = unreduced[0].item() - total_count = unreduced[1].item() - percent = float(correct_ans) * 100.0 / float(total_count) - elapsed_time = time.time() - start_time - print_rank_last(' > |epoch: {}| metrics for {}: correct / total ' - '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format( - epoch, name, correct_ans, total_count, - percent, elapsed_time)) + # Print on screen. + if args.task == 'CoLA': + from sklearn.metrics import matthews_corrcoef + mcc = matthews_corrcoef(labels_gather, predicted_gather) + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: mcc ' + '= {} , elapsed time (sec): {:.3f}'.format( + epoch, name, mcc, elapsed_time)) + else: + from scipy.stats import pearsonr, spearmanr + pearson_corr = pearsonr(predicted_gather, labels_gather)[0] + spearman_corr = spearmanr(predicted_gather, labels_gather)[0] + corr = (pearson_corr + spearman_corr) / 2 + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: pearson ' + '= {} spearmanr = {} corr = {} elapsed time (sec): {:.3f}'.format( + epoch, name, pearson_corr, spearman_corr, + corr, elapsed_time)) - if output_predictions: - return correct_ans, total_count, (softmaxes, labels, ids) - return correct_ans, total_count + if output_predictions: + return 0, 0, () + return 0, 0 + else: + unreduced = get_accelerator().LongTensor([correct, total]) + torch.distributed.all_reduce(unreduced, + group=mpu.get_data_parallel_group()) + + # Print on screen. 
+ + correct_ans = unreduced[0].item() + total_count = unreduced[1].item() + percent = float(correct_ans) * 100.0 / float(total_count) + elapsed_time = time.time() - start_time + print_rank_last(' > |epoch: {}| metrics for {}: correct / total ' + '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format( + epoch, name, correct_ans, total_count, + percent, elapsed_time)) + + if output_predictions: + return correct_ans, total_count, (softmaxes, labels, ids) + return correct_ans, total_count if output_predictions: return 0, 0, () return 0, 0 diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index fd4e79a98..c9f2daf6d 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -79,6 +79,48 @@ def _cross_entropy_forward_step(batch, model): return output_tensor, partial(cross_entropy_loss_func, labels) +def process_batch_mse(batch): + """Process batch and produce inputs for the model.""" + args = get_args() + + tokens = batch['text'].long().to(get_accelerator().device_name()).contiguous() + types = batch['types'].long().to(get_accelerator().device_name()).contiguous() + labels = batch['label'].float().to(get_accelerator().device_name()).contiguous() + attention_mask = batch['padding_mask'].float().to(get_accelerator().device_name()).contiguous() + if args.fp16: + attention_mask = attention_mask.half() + + return tokens, types, labels, attention_mask + +def mse_loss_func(labels, output_tensor): + logits = output_tensor + + # Cross-entropy loss. + loss_func = torch.nn.MSELoss() + loss = loss_func(logits.contiguous().float().view(-1), labels.view(-1)) + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + +def mse_forward_step(batch, model): + """Simple forward step with cross-entropy loss.""" + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + try: + batch_ = next(batch) + except BaseException: + batch_ = batch + tokens, types, labels, attention_mask = process_batch_mse(batch_) + timers('batch-generator').stop() + + # Forward model. + output_tensor = model(tokens, attention_mask, tokentype_ids=types) + + return output_tensor, partial(mse_loss_func, labels) def build_data_loader(dataset, micro_batch_size, num_workers, drop_last): """Data loader. Note that batch-size is the local (per GPU) batch-size.""" diff --git a/tasks/glue/cola.py b/tasks/glue/cola.py new file mode 100644 index 000000000..123f79533 --- /dev/null +++ b/tasks/glue/cola.py @@ -0,0 +1,90 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
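Before the new GLUE dataset readers, a note on the STS-B path added above: it reuses the Classification head with a single output as a regression head, trains it with MSE against float similarity scores, and reports Pearson/Spearman correlation instead of accuracy. A toy sketch of the added loss computation (synthetic tensors; STS-B gold scores lie in [0, 5]):

import torch

logits = torch.randn(8, 1)        # Classification head built with num_classes=1
labels = 5.0 * torch.rand(8)      # float similarity scores in [0, 5]
loss = torch.nn.MSELoss()(logits.contiguous().float().view(-1), labels.view(-1))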
+ +"""CoLA dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class CoLADataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('CoLA', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 2: + is_test = True + print_rank_0(' reading {} and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + self.test_label)) + continue + + if is_test: + assert len(row) == 2, 'expected length 2: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = None + label = self.test_label + assert len(text_a) > 0 + else: + if len(row) == 4: + uid = total + text_a = clean_text(row[3].strip()) + text_b = None + label = int(row[1].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/finetune.py b/tasks/glue/finetune.py index ad1938b0c..9f57734b0 100644 --- a/tasks/glue/finetune.py +++ b/tasks/glue/finetune.py @@ -21,7 +21,7 @@ from megatron import mpu from megatron.model.classification import Classification from tasks.eval_utils import accuracy_func_provider -from tasks.finetune_utils import finetune +from tasks.finetune_utils import finetune, mse_forward_step def glue_classification(num_classes, Dataset, @@ -60,9 +60,15 @@ def single_dataset_provider(datapath): return Dataset(name, [datapath], tokenizer, args.seq_length) return accuracy_func_provider(single_dataset_provider) + args = get_args() """Finetune/evaluate.""" - finetune(train_valid_datasets_provider, model_provider, - end_of_epoch_callback_provider=metrics_func_provider) + if args.task == 'STS-B': + finetune(train_valid_datasets_provider, model_provider, + forward_step=mse_forward_step, + end_of_epoch_callback_provider=metrics_func_provider) + else: + finetune(train_valid_datasets_provider, model_provider, + end_of_epoch_callback_provider=metrics_func_provider) def main(): @@ -85,7 +91,54 @@ def name_from_datapath(datapath): def name_from_datapath(datapath): return datapath.split('QQP')[-1].strip( '.tsv').strip('/').replace('_', '-') + elif args.task == 'QNLI': + + num_classes = 2 + from tasks.glue.qnli import QNLIDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('QNLI')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'SST-2': + + num_classes = 2 + from tasks.glue.sst2 import SST2Dataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('SST-2')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'CoLA': + + num_classes = 2 + from tasks.glue.cola 
import CoLADataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('CoLA')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'STS-B': + + num_classes = 1 + from tasks.glue.stsb import STSBDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('STS-B')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'MRPC': + + num_classes = 2 + from tasks.glue.mrpc import MRPCDataset as Dataset + def name_from_datapath(datapath): + return datapath.split('MRPC')[-1].strip( + '.tsv').strip('/').replace('_', '-') + elif args.task == 'RTE': + + num_classes = 2 + from tasks.glue.rte import RTEDataset as Dataset + + def name_from_datapath(datapath): + return datapath.split('RTE')[-1].strip( + '.tsv').strip('/').replace('_', '-') else: raise NotImplementedError('GLUE task {} is not implemented.'.format( args.task)) diff --git a/tasks/glue/mrpc.py b/tasks/glue/mrpc.py new file mode 100644 index 000000000..8dfac8402 --- /dev/null +++ b/tasks/glue/mrpc.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MRPC dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class MRPCDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('MRPC', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if row[0].strip() == 'index': + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[3].strip(), + row[4].strip(), self.test_label)) + else: + assert len(row) == 5 + print_rank_0(' reading {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[3].strip(), + row[4].strip())) + continue + + if is_test: + assert len(row) == 5, 'expected length 5: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[3].strip()) + text_b = clean_text(row[4].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 5: + uid = total + text_a = clean_text(row[3].strip()) + text_b = clean_text(row[4].strip()) + label = int(row[0].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = 
{'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/qnli.py b/tasks/glue/qnli.py new file mode 100644 index 000000000..af0841d4a --- /dev/null +++ b/tasks/glue/qnli.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""QNLI dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = {'entailment': 0, 'not_entailment': 1} + + +class QNLIDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label='entailment'): + self.test_label = test_label + super().__init__('QNLI', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 3: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), self.test_label)) + else: + assert len(row) == 4 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), row[3].strip())) + continue + + if is_test: + assert len(row) == 3, 'expected length 3: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 4: + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = row[-1].strip() + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': LABELS[label]} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/rte.py b/tasks/glue/rte.py new file mode 100644 index 000000000..096a26ecc --- /dev/null +++ b/tasks/glue/rte.py @@ -0,0 +1,101 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RTE dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = {'entailment': 0, 'not_entailment': 1} + + +class RTEDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label='entailment'): + self.test_label = test_label + super().__init__('RTE', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 3: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), self.test_label)) + else: + assert len(row) == 4 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[1].strip(), + row[2].strip(), row[3].strip())) + continue + + if is_test: + assert len(row) == 3, 'expected length 3: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 4: + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = clean_text(row[2].strip()) + label = row[-1].strip() + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': LABELS[label]} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/sst2.py b/tasks/glue/sst2.py new file mode 100644 index 000000000..966efc247 --- /dev/null +++ b/tasks/glue/sst2.py @@ -0,0 +1,95 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""SST-2 dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [0, 1] + + +class SST2Dataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0): + self.test_label = test_label + super().__init__('SST-2', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if row[0].strip() == 'index': + is_test = True + print_rank_0(' reading {} and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[1].strip(), + self.test_label)) + else: + assert len(row) == 2 + print_rank_0(' reading {} and {} columns' + ' ...'.format( + row[0].strip(), row[1].strip())) + continue + + if is_test: + assert len(row) == 2, 'expected length 2: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[1].strip()) + text_b = None + label = self.test_label + assert len(text_a) > 0 + else: + if len(row) == 2: + uid = total + text_a = clean_text(row[0].strip()) + text_b = None + label = int(row[-1].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + assert label in LABELS + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/glue/stsb.py b/tasks/glue/stsb.py new file mode 100644 index 000000000..692724620 --- /dev/null +++ b/tasks/glue/stsb.py @@ -0,0 +1,100 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""STS-B dataset.""" + +from megatron import print_rank_0 +from tasks.data_utils import clean_text +from .data import GLUEAbstractDataset + + +LABELS = [None] + + +class STSBDataset(GLUEAbstractDataset): + + def __init__(self, name, datapaths, tokenizer, max_seq_length, + test_label=0.0): + self.test_label = test_label + super().__init__('STS-B', name, datapaths, + tokenizer, max_seq_length) + + def process_samples_from_single_path(self, filename): + """"Implement abstract method.""" + print_rank_0(' > Processing {} ...'.format(filename)) + + samples = [] + total = 0 + first = True + is_test = False + with open(filename, 'r') as f: + for line in f: + row = line.strip().split('\t') + if first: + first = False + if len(row) == 9: + is_test = True + print_rank_0(' reading {}, {}, and {} columns and ' + 'setting labels to {}'.format( + row[0].strip(), row[7].strip(), + row[8].strip(), self.test_label)) + else: + assert len(row) == 10 + print_rank_0(' reading {}, {}, {}, and {} columns' + ' ...'.format( + row[0].strip(), row[7].strip(), + row[8].strip(), row[-1].strip())) + continue + + if is_test: + assert len(row) == 9, 'expected length 9: {}'.format(row) + uid = int(row[0].strip()) + text_a = clean_text(row[7].strip()) + text_b = clean_text(row[8].strip()) + label = self.test_label + assert len(text_a) > 0 + assert len(text_b) > 0 + else: + if len(row) == 10: + uid = int(row[0].strip()) + text_a = clean_text(row[7].strip()) + text_b = clean_text(row[8].strip()) + label = float(row[-1].strip()) + else: + print_rank_0('***WARNING*** index error, ' + 'skipping: {}'.format(row)) + continue + if len(text_a) == 0: + print_rank_0('***WARNING*** zero length a, ' + 'skipping: {}'.format(row)) + continue + if len(text_b) == 0: + print_rank_0('***WARNING*** zero length b, ' + 'skipping: {}'.format(row)) + continue + assert uid >= 0 + + sample = {'uid': uid, + 'text_a': text_a, + 'text_b': text_b, + 'label': label} + total += 1 + samples.append(sample) + + if total % 50000 == 0: + print_rank_0(' > processed {} so far ...'.format(total)) + + print_rank_0(' >> processed {} samples.'.format(len(samples))) + return samples diff --git a/tasks/main.py b/tasks/main.py index f5bd5ad69..978359a82 100644 --- a/tasks/main.py +++ b/tasks/main.py @@ -77,7 +77,7 @@ def get_tasks_args(parser): if args.task == 'RACE': from race.finetune import main - elif args.task in ['MNLI', 'QQP']: + elif args.task in ['MNLI', 'QQP', 'QNLI', 'SST-2', 'CoLA', 'STS-B', 'MRPC', 'RTE']: from glue.finetune import main elif args.task in ['LAMBADA', 'WIKITEXT103']: from zeroshot_gpt.evaluate import main
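The QNLI, RTE, SST-2, and STS-B readers introduced above all follow the same template: the header row is inspected once to decide whether the split is a test file (no label column), and every remaining row is reduced to a {'uid', 'text_a', 'text_b', 'label'} sample, with malformed or empty rows skipped. Below is a minimal standalone sketch of the two-sentence (QNLI/RTE-style) variant of that convention; it uses a hypothetical in-memory TSV and Python's csv module instead of the real GLUE files and the clean_text/GLUEAbstractDataset machinery, so it illustrates the parsing pattern rather than reproducing the code in this patch.

import csv
import io

LABELS = {'entailment': 0, 'not_entailment': 1}

# Hypothetical in-memory QNLI-style split; real runs read the GLUE TSV files.
# Train/dev rows end with a label column, test rows carry only index,
# question, and sentence.
TRAIN_TSV = ("index\tquestion\tsentence\tlabel\n"
             "0\tWhat is QNLI?\tQNLI is a GLUE task.\tentailment\n"
             "1\tIs the sky green?\tGrass is green.\tnot_entailment\n")


def parse_qnli_like(tsv_text, test_label='entailment'):
    reader = csv.reader(io.StringIO(tsv_text), delimiter='\t')
    header = next(reader)
    is_test = len(header) == 3      # no label column -> test split
    samples = []
    for row in reader:
        if len(row) != (3 if is_test else 4):
            continue                # mirrors the "index error ... skipping" warning
        uid = int(row[0].strip())
        text_a = row[1].strip()
        text_b = row[2].strip()
        if not text_a or not text_b:
            continue                # mirrors the zero-length-text skips
        label = test_label if is_test else row[3].strip()
        samples.append({'uid': uid, 'text_a': text_a,
                        'text_b': text_b, 'label': LABELS[label]})
    return samples


print(parse_qnli_like(TRAIN_TSV))

SST-2 follows the same flow with a single sentence (text_b is None) and integer labels, and STS-B keeps a float similarity score instead of a class id. Combined with the tasks/main.py dispatch change at the end of the patch, these datasets become reachable from the shared GLUE finetuning entry point via the task argument (e.g. --task QNLI or --task STS-B), alongside the existing MNLI and QQP tasks.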