diff --git a/train_configs/llama2_13b.toml b/train_configs/llama2_13b.toml
index 280ac2ae..f3048ac4 100644
--- a/train_configs/llama2_13b.toml
+++ b/train_configs/llama2_13b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama2_70b.toml b/train_configs/llama2_70b.toml
index 959c270a..97b1bc71 100644
--- a/train_configs/llama2_70b.toml
+++ b/train_configs/llama2_70b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama2_7b.toml b/train_configs/llama2_7b.toml
index f2e66de7..95b4c496 100644
--- a/train_configs/llama2_7b.toml
+++ b/train_configs/llama2_7b.toml
@@ -32,11 +32,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1 # dp-only would be sufficient for 7B
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama3_70b.toml b/train_configs/llama3_70b.toml
index f45632ad..d498e677 100644
--- a/train_configs/llama3_70b.toml
+++ b/train_configs/llama3_70b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama3_8b.toml b/train_configs/llama3_8b.toml
index aaba99a2..f194addb 100644
--- a/train_configs/llama3_8b.toml
+++ b/train_configs/llama3_8b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
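
Note on the resulting layout, as a minimal sketch assembled from the hunks above: each train_configs/*.toml now gates pipeline parallelism behind a new [experimental] table instead of listing pipeline_parallel_degree next to the other parallelism degrees. The enclosing section header for the first group of keys is outside the hunk context and is not shown in the diff; all other keys are unchanged and elided here.

    # sketch of the new shape of each config file (other keys omitted)
    data_parallel_degree = -1
    tensor_parallel_degree = 1    # 8 in the 70B configs
    fp8_linear = ""
    compile = false
    dataset = "c4"

    [experimental]
    pipeline_parallel_degree = 1  # moved here from the block above

    [checkpoint]
    enable_checkpoint = false
    folder = "checkpoint"

Any config or override that still sets pipeline_parallel_degree in its old location will need to move it under [experimental] to match this layout.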