diff --git a/tests/pytorch/nightly/llama2-model.libsonnet b/tests/pytorch/nightly/llama2-model.libsonnet
index 8b8efcf82..04cdb21f3 100644
--- a/tests/pytorch/nightly/llama2-model.libsonnet
+++ b/tests/pytorch/nightly/llama2-model.libsonnet
@@ -45,23 +45,15 @@ local utils = import 'templates/utils.libsonnet';
     },
     command: self.paramsOverride.trainCommand,
   },
+  local pjrt = self.pjrt,
+  pjrt:: common.PyTorchTpuVmMixin {
+    modelName: 'llama2-pjrt',
+  },
   local infer = self.infer,
-  infer:: common.PyTorchTpuVmMixin {
+  infer:: common.PyTorchTpuVmMixin + pjrt {
     modelName+: '-infer',
     tpuSettings+: {
       tpuVmExtraSetup: |||
-        pip3 uninstall torch torch_xla torchvision libtpu-nightly -y
-        sudo apt-get update -y
-        sudo apt-get install libomp5 -y
-        pip3 install mkl mkl-include
-        pip3 install tf-nightly tb-nightly tbp-nightly
-        pip3 install numpy
-        sudo apt-get install numactl -y
-        sudo apt-get install libopenblas-dev -y
-        pip3 install --user --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
-        pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
-        pip3 install torch_xla[tpuvm]
-
         # install tokenizer model
         wget https://storage.googleapis.com/tpu-pytorch/lsiyuan-experiment/llama/spiece.model

@@ -93,7 +85,7 @@ local utils = import 'templates/utils.libsonnet';
     },
   },
   local spmd = self.spmd,
-  spmd:: common.PyTorchTpuVmMixin {
+  spmd:: common.PyTorchTpuVmMixin + pjrt {
     modelName+: '-train-spmd',
     tpuSettings+: {
       tpuVmExports+: |||
@@ -110,19 +102,6 @@ local utils = import 'templates/utils.libsonnet';
         export TPU_MEGACORE=megacore_dense
       |||,
       tpuVmExtraSetup: |||
-        pip3 uninstall torch torch_xla torchvision libtpu-nightly -y
-        sudo apt update -y
-        sudo apt-get update -y
-        pip install accelerate -U
-        sudo apt-get install libomp5 -y
-        pip3 install mkl mkl-include
-        pip3 install numpy
-        sudo apt-get install numactl -y
-        sudo apt-get install libopenblas-dev -y
-        pip3 install --user --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
-        pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl
-        pip3 install torch_xla[tpuvm]
-
         # install tokenizer model
         wget https://storage.googleapis.com/tpu-pytorch/lsiyuan-experiment/llama/spiece.model

@@ -144,10 +123,23 @@ local utils = import 'templates/utils.libsonnet';
         wget https://storage.googleapis.com/manfei_public_experimental/2B.json

         # save llama2 training
-        echo -e 'python transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 32 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/7B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none' >> llama2training.sh
+        echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/7B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
+        echo -e 'import numpy as np' >> getvalue.py
+        echo -e 'file = open("output.txt")' >> getvalue.py
+        echo -e 'content = file.readlines()' >> getvalue.py
+        echo -e 'value_line = content[-1]' >> getvalue.py
+        echo -e 'value_value = float((value_line.split())[2])' >> getvalue.py
+        echo -e 'value_value = np.reciprocal(value_value)' >> getvalue.py
+        echo -e 'if value_value > 14.000 or value_value < 12.667:' >> getvalue.py
+        echo -e ' raise ValueError("train_steps_per_second exceeded threshold 13.333 +- 5%")' >> getvalue.py
+        echo -e 'else:' >> getvalue.py
+        echo -e ' print("Finished llama2 test; train_steps_per_second within expected threshold 13.333 +- 5%")' >> getvalue.py
+        echo -e 'cat output.txt' >> llama2training.sh
+        echo -e 'python3 transformers/getvalue.py' >> llama2training.sh
         cat llama2training.sh
         pwd
         ls
+
       |||,
     },
   },
diff --git a/tests/pytorch/r2.1/llama2-model.libsonnet b/tests/pytorch/r2.1/llama2-model.libsonnet
index 4e8d55182..a2b653d3d 100644
--- a/tests/pytorch/r2.1/llama2-model.libsonnet
+++ b/tests/pytorch/r2.1/llama2-model.libsonnet
@@ -222,17 +222,17 @@ local utils = import 'templates/utils.libsonnet';

         # save llama2 training
         cd ..
-        echo -e 'python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none > output.txt' >> llama2training.sh
+        echo -e 'XLA_USE_BF16=1 python3 transformers/examples/pytorch/language-modeling/run_clm.py --tokenizer_name hf-internal-testing/llama-tokenizer --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 256 --per_device_eval_batch_size 8 --num_train_epochs 1 --do_train --output_dir /tmp/output --overwrite_output_dir --config_name transformers/2B/2B.json --save_strategy no --logging_strategy no --remove_unused_columns no --spmd_fsdp_sharding --torch_dtype bfloat16 --dataloader_drop_last yes --spmd_grad_chkpt --report_to none --optim adafactor > output.txt' >> llama2training.sh
         echo -e 'import numpy as np' >> getvalue.py
         echo -e 'file = open("output.txt")' >> getvalue.py
         echo -e 'content = file.readlines()' >> getvalue.py
         echo -e 'value_line = content[-1]' >> getvalue.py
         echo -e 'value_value = float((value_line.split())[2])' >> getvalue.py
         echo -e 'value_value = np.reciprocal(value_value)' >> getvalue.py
-        echo -e 'if value_value > 6.863 or value_value < 6.209 :' >> getvalue.py
-        echo -e ' raise ValueError("expose to train_steps_per_second exceeded throuhold 6.536 +- 5%")' >> getvalue.py
+        echo -e 'if value_value > 14.000 or value_value < 12.667:' >> getvalue.py
+        echo -e ' raise ValueError("train_steps_per_second exceeded threshold 13.333 +- 5%")' >> getvalue.py
         echo -e 'else:' >> getvalue.py
-        echo -e ' print("Finished llama2 test and warm latency/token within expected throuhold 6.536 +- 5%")' >> getvalue.py
+        echo -e ' print("Finished llama2 test; train_steps_per_second within expected threshold 13.333 +- 5%")' >> getvalue.py
         echo -e 'cat output.txt' >> llama2training.sh
         echo -e 'python3 transformers/getvalue.py' >> llama2training.sh
         cat llama2training.sh
@@ -383,7 +383,7 @@ local utils = import 'templates/utils.libsonnet';
     llama2_inference + v4_8 + common.Functional + timeouts.Hours(3) + infer7B,
     llama2_inference + v4_8 + common.Functional + timeouts.Hours(3) + infer70B,
     llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B,
-    llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B128,
+    llama2_training + v4_8 + common.Functional + timeouts.Hours(3) + spmd2B256,
     llama2_training + convergence + v4_8 + common.Functional + timeouts.Hours(3) + spmd2Bconv,
   ],
 }
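For reference, the echoed `getvalue.py` lines in both hunks assemble the same throughput check. Below is a minimal standalone sketch of that script, assuming (as the diff does) that the run_clm.py log is captured in output.txt, that the metric sits in the third whitespace-separated field of the log's last line, and that the target is 13.333 train_steps_per_second +- 5%; the variable names and messages here are illustrative, not the exact strings in the diff.

```python
import numpy as np

# Target taken from the diff: 13.333 train_steps_per_second +- 5%.
EXPECTED = 13.333
LOW, HIGH = EXPECTED * 0.95, EXPECTED * 1.05  # ~12.667 and ~14.000

# output.txt is the run_clm.py log captured by llama2training.sh;
# the metric is assumed to be the third field of the last line.
with open("output.txt") as f:
    last_line = f.readlines()[-1]

raw = float(last_line.split()[2])
steps_per_second = np.reciprocal(raw)  # the diff inverts the parsed value

if steps_per_second > HIGH or steps_per_second < LOW:
    raise ValueError(
        f"train_steps_per_second {steps_per_second:.3f} outside threshold {EXPECTED} +- 5%"
    )
print(f"Finished llama2 test: {steps_per_second:.3f} within threshold {EXPECTED} +- 5%")
```

The hard-coded bounds in the diff follow directly from the tolerance: 13.333 * 0.95 ~= 12.667 and 13.333 * 1.05 ~= 14.000.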