diff --git a/train_configs/llama2_13b.toml b/train_configs/llama2_13b.toml index 280ac2ae8..f3048ac48 100644 --- a/train_configs/llama2_13b.toml +++ b/train_configs/llama2_13b.toml @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping steps = 1000 data_parallel_degree = -1 tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 fp8_linear = "" compile = false dataset = "c4" +[experimental] +pipeline_parallel_degree = 1 + [checkpoint] enable_checkpoint = false folder = "checkpoint" diff --git a/train_configs/llama2_70b.toml b/train_configs/llama2_70b.toml index 959c270ad..97b1bc717 100644 --- a/train_configs/llama2_70b.toml +++ b/train_configs/llama2_70b.toml @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping steps = 1000 data_parallel_degree = -1 tensor_parallel_degree = 8 # 8-way TP -pipeline_parallel_degree = 1 fp8_linear = "" compile = false dataset = "c4" +[experimental] +pipeline_parallel_degree = 1 + [checkpoint] enable_checkpoint = false folder = "checkpoint" diff --git a/train_configs/llama2_7b.toml b/train_configs/llama2_7b.toml index f2e66de75..95b4c4962 100644 --- a/train_configs/llama2_7b.toml +++ b/train_configs/llama2_7b.toml @@ -32,11 +32,13 @@ max_norm = 1.0 # grad norm clipping steps = 1000 data_parallel_degree = -1 tensor_parallel_degree = 1 # dp-only would be sufficient for 7B -pipeline_parallel_degree = 1 fp8_linear = "" compile = false dataset = "c4" +[experimental] +pipeline_parallel_degree = 1 + [checkpoint] enable_checkpoint = false folder = "checkpoint" diff --git a/train_configs/llama3_70b.toml b/train_configs/llama3_70b.toml index f45632ad4..d498e677a 100644 --- a/train_configs/llama3_70b.toml +++ b/train_configs/llama3_70b.toml @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping steps = 1000 data_parallel_degree = -1 tensor_parallel_degree = 8 # 8-way TP -pipeline_parallel_degree = 1 fp8_linear = "" compile = false dataset = "c4" +[experimental] +pipeline_parallel_degree = 1 + [checkpoint] enable_checkpoint = false folder = "checkpoint" diff --git a/train_configs/llama3_8b.toml b/train_configs/llama3_8b.toml index aaba99a21..f194addb9 100644 --- a/train_configs/llama3_8b.toml +++ b/train_configs/llama3_8b.toml @@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping steps = 1000 data_parallel_degree = -1 tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 fp8_linear = "" compile = false dataset = "c4" +[experimental] +pipeline_parallel_degree = 1 + [checkpoint] enable_checkpoint = false folder = "checkpoint"