
Commit e827775

0.5.dev: first refactor; add offline_tokenization for pretraining; add xxpo for DPO/RPO/ORPO; add mpt as an optional, safe, but space-consuming way for full-parameter pretraining/MFT
1 parent bce9d43 commit e827775

31 files changed: +5633 -596 lines

mftcoder_accelerate/src/data/blendable_dataset.py (+3 -5)
@@ -43,7 +43,7 @@ def __init__(self, datasets, weights):

        # recompute weights
        weights = self.calc_weights()
-
+
        # Build indices.
        start_time = time.time()
        assert num_datasets < 255
@@ -63,17 +63,15 @@ def __init__(self, datasets, weights):

        print(
            "> RANK {} elapsed time for building blendable dataset indices: "
-            "{:.2f} (sec)".format(
-                torch.distributed.get_rank(), time.time() - start_time
-            )
+            "{:.2f} (sec)".format(torch.distributed.get_rank(), time.time() - start_time)
        )

    def calc_weights(self):
        dataset_sample_cnt = [len(ds) for ds in self.datasets]
        total_cnt = sum(dataset_sample_cnt)
        weights = np.array([(cnt + 0.0) / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64)
        return weights
-
+
    def __len__(self):
        return self.size

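For context on what the reflowed `calc_weights` hunk computes: each dataset's blending weight is its sample count divided by the total sample count across datasets. A minimal standalone sketch of that proportional weighting (the `FakeDataset` class and the sizes below are illustrative, not part of the repo):

```python
import numpy as np

# Hypothetical stand-in for the indexed datasets blended by BlendableDataset.
class FakeDataset:
    def __init__(self, n):
        self.n = n

    def __len__(self):
        return self.n

def calc_weights(datasets):
    # Mirrors the logic visible in BlendableDataset.calc_weights above:
    # weight_i = len(dataset_i) / sum(len(dataset_j))
    dataset_sample_cnt = [len(ds) for ds in datasets]
    total_cnt = sum(dataset_sample_cnt)
    return np.array([(cnt + 0.0) / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64)

print(calc_weights([FakeDataset(100), FakeDataset(300)]))  # -> [0.25 0.75]
```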
mftcoder_accelerate/src/data/data_utils.py (+23 -35)
@@ -32,10 +32,7 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
-    print_rank_0(
-        " > finished creating indexed dataset in {:4f} "
-        "seconds".format(time.time() - start_time)
-    )
+    print_rank_0(" > finished creating indexed dataset in {:4f} " "seconds".format(time.time() - start_time))
    print_rank_0(" number of documents: {}".format(indexed_dataset.sizes.shape[0]))

    return indexed_dataset
@@ -53,20 +50,22 @@ def build_train_valid_test_datasets(
    build_index_mappings=True,
    shuffle_before_split=False,
    weighted_loss_mode=None,
-    ds_weights=[1., 1., 1.],
-    train_mode='sft',
+    ds_weights=[1.0, 1.0, 1.0],
+    train_mode="sft",
):
    """Build train, valid, and test datasets."""

    # Indexed dataset.
-    assert os.path.exists(data_prefix + "_input_ids.bin"), f"Input tokens datafile not found: {data_prefix}_input_ids.bin"
+    assert os.path.exists(
+        data_prefix + "_input_ids.bin"
+    ), f"Input tokens datafile not found: {data_prefix}_input_ids.bin"

    # Indexed dataset.
    input_ids_indexed_dataset = get_indexed_dataset_(data_prefix + "_input_ids", data_impl, skip_warmup)
-    if train_mode == 'sft':
+    if train_mode == "sft":
        loss_mask_indexed_dataset = get_indexed_dataset_(data_prefix + "_loss_mask", data_impl, skip_warmup)
    else:
-        print(f'pretrain mode, loss mask is ones')
+        print(f"pretrain mode, loss mask is ones")
        loss_mask_indexed_dataset = None

    total_num_of_documents = input_ids_indexed_dataset.sizes.shape[0]
@@ -79,9 +78,7 @@ def print_split_stats(name, index):
        print_rank_0(" {}:".format(name))
        print_rank_0(
            " document indices in [{}, {}) total of {} "
-            "documents".format(
-                splits[index], splits[index + 1], splits[index + 1] - splits[index]
-            )
+            "documents".format(splits[index], splits[index + 1], splits[index + 1] - splits[index])
        )

    print_split_stats("train", 0)
@@ -100,11 +97,9 @@ def build_dataset(index, name, ds_weight=1.0):
        dataset = None
        if splits[index + 1] > splits[index]:
            if shuffle_before_split:
-                documents = shuffle_doc_index[splits[index]:splits[index + 1]]
+                documents = shuffle_doc_index[splits[index] : splits[index + 1]]
            else:
-                documents = np.arange(
-                    start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32
-                )
+                documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32)

            dataset = GPT2PromptDataset(
                name,
@@ -130,11 +125,13 @@ def build_dataset(index, name, ds_weight=1.0):
    return train_dataset, valid_dataset, test_dataset, total_num_of_documents


-def build_multiple_train_valid_test_datasets(args, train_valid_test_num_samples, use_shared_fs=True, data_impl="mmap", mmap_warmup=False):
+def build_multiple_train_valid_test_datasets(
+    args, train_valid_test_num_samples, use_shared_fs=True, data_impl="mmap", mmap_warmup=False
+):
    """Build multiple train, valid, and test datasets."""
-    data_prefixes = list(args.data_paths[1:-1].split(','))
+    data_prefixes = list(args.data_paths[1:-1].split(","))

-    data_weights = list(map(float, args.data_weights[1:-1].split(',')))
+    data_weights = list(map(float, args.data_weights[1:-1].split(",")))
    print("data weights: ")
    print(data_weights)
    use_shared_fs = use_shared_fs
@@ -143,7 +140,7 @@ def build_multiple_train_valid_test_datasets(args, train_valid_test_num_samples,
    seq_length = args.seq_length
    # seq_length = args.block_size
    seed = args.seed
-    skip_warmup = (not mmap_warmup)
+    skip_warmup = not mmap_warmup
    weight_by_num_documents = args.weight_by_num_documents
    shuffle_before_split = args.shuffle_before_split
    weighted_loss_mode = args.weighted_loss_mode
@@ -183,9 +180,7 @@ def build_multiple_train_valid_test_datasets(args, train_valid_test_num_samples,
    factor = 1
    if weight_by_num_documents:
        # gets the number of documents in each data path
-        get_num_docs_list = lambda datasets: [
-            dataset.input_ids_indexed_dataset.sizes.shape[0] for dataset in datasets
-        ]
+        get_num_docs_list = lambda datasets: [dataset.input_ids_indexed_dataset.sizes.shape[0] for dataset in datasets]
        train_num_docs, valid_num_docs, test_num_docs = (
            get_num_docs_list(train_datasets),
            get_num_docs_list(valid_datasets),
@@ -201,7 +196,7 @@ def build_multiple_train_valid_test_datasets(args, train_valid_test_num_samples,
        )
        assert sum(train_weights) != 0.0, "found train weights to be 0.0"
        assert sum(valid_weights) != 0.0, "found valid weights to be 0.0"
-
+
        train_weights, train_num_samples = get_normalized_weights_and_num_samples(
            train_weights, train_valid_test_num_samples[0]
        )
@@ -265,7 +260,7 @@ def build_multiple_train_valid_test_datasets(args, train_valid_test_num_samples,
    if num_tokens:
        factor = sum(num_tokens) / (sum(total_sample_cnt) * args.seq_length)
        factor /= sum([1.0 / w for w in train_ds_weights]) / len(train_ds_weights)
-
+
    print_rank_0(f"> common denomination factor for CE loss: {factor}")

    # Blend.
@@ -274,7 +269,7 @@ def build_multiple_train_valid_test_datasets(args, train_valid_test_num_samples,
        i = 0
        for ds in train_datasets:
            ds.update_ds_weight(ds.ds_weight / factor)
-            print(f'loss weight of dataset {i} after update: {ds.ds_weight}')
+            print(f"loss weight of dataset {i} after update: {ds.ds_weight}")
            i += 1
        blending_train_dataset = BlendableDataset(train_datasets, train_weights)
    blending_valid_dataset = None
@@ -318,9 +313,7 @@ def get_train_valid_test_split_(splits_string, size):
    return splits_index


-def get_normalized_weights_and_num_samples(
-    weights: List[float], num_samples: int
-) -> Tuple[List[float], List[int]]:
+def get_normalized_weights_and_num_samples(weights: List[float], num_samples: int) -> Tuple[List[float], List[int]]:
    # Normalize weights
    weight_sum = sum(weights)
    assert weight_sum > 0.0
@@ -346,12 +339,7 @@ def get_datasets_normalized_weights_and_num_samples(
    # samples left to feed to the network.
    weighted_num_samples = []
    for weight in weights:
-        weighted_num_samples.append(
-            [
-                int(math.ceil(val * weight * 1.005))
-                for val in num_samples
-            ]
-        )
+        weighted_num_samples.append([int(math.ceil(val * weight * 1.005)) for val in num_samples])
    return weights, weighted_num_samples

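The `get_normalized_weights_and_num_samples` and `get_datasets_normalized_weights_and_num_samples` hunks keep the same arithmetic after the reflow: weights are normalized to sum to 1, then each dataset's requested sample count is padded by a factor of 1.005 so the blend never runs short. A rough, self-contained sketch of that calculation (names here are illustrative, not the module's exact API):

```python
import math
from typing import List, Tuple

def normalized_weights_and_num_samples(weights: List[float], num_samples: int) -> Tuple[List[float], List[int]]:
    # Normalize the raw weights so they sum to 1.0.
    weight_sum = sum(weights)
    assert weight_sum > 0.0
    weights = [w / weight_sum for w in weights]
    # Oversample each dataset slightly (x1.005) so blending never runs out of samples.
    per_dataset = [int(math.ceil(num_samples * w * 1.005)) for w in weights]
    return weights, per_dataset

weights, counts = normalized_weights_and_num_samples([2.0, 1.0, 1.0], 1000)
print(weights)  # [0.5, 0.25, 0.25]
print(counts)   # [503, 252, 252]
```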
mftcoder_accelerate/src/data/gpt2_dataset.py (+16 -39)
@@ -41,7 +41,7 @@ def __init__(
        use_shared_fs=True,
        weighted_loss_mode=None,
        ds_weight=1.0,
-        train_mode='sft',
+        train_mode="sft",
    ):

        self.name = name
@@ -50,9 +50,9 @@ def __init__(

        self.weighted_loss_mode = weighted_loss_mode
        self.ds_weight = ds_weight
-
-        self.task_name = data_prefix.split('/')[-1]
-
+
+        self.task_name = data_prefix.split("/")[-1]
+
        self.task_id = TASK2ID[self.task_name]

        # Checks
@@ -114,14 +114,10 @@ def __getitem__(self, idx):

        else:
            # Otherwise, get the rest of the initial document.
-            input_ids_list = [
-                self.input_ids_indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)
-            ]
+            input_ids_list = [self.input_ids_indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)]

            if self.loss_mask_indexed_dataset is not None:
-                loss_mask_list = [
-                    self.loss_mask_indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)
-                ]
+                loss_mask_list = [self.loss_mask_indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)]
            else:
                loss_mask_list = []

@@ -133,16 +129,12 @@ def __getitem__(self, idx):

            # And finally add the relevant portion of last document.
            input_ids_list.append(
-                self.input_ids_indexed_dataset.get(
-                    self.doc_idx[doc_index_l], length=offset_l + 1
-                )
+                self.input_ids_indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)
            )

            if self.loss_mask_indexed_dataset is not None:
                loss_mask_list.append(
-                    self.loss_mask_indexed_dataset.get(
-                        self.doc_idx[doc_index_l], length=offset_l + 1
-                    )
+                    self.loss_mask_indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)
                )

            input_ids = np.concatenate(input_ids_list)
@@ -246,18 +238,12 @@ def __getitem__(self, idx):
            )
        else:
            # Otherwise, get the rest of the initial document.
-            sample_list = [
-                self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)
-            ]
+            sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)]
            # Loop over all in between documents and add the entire document.
            for i in range(doc_index_f + 1, doc_index_l):
                sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
            # And finally add the relevant portion of last document.
-            sample_list.append(
-                self.indexed_dataset.get(
-                    self.doc_idx[doc_index_l], length=offset_l + 1
-                )
-            )
+            sample_list.append(self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1))
            sample = np.concatenate(sample_list)

        return {"text": np.array(sample, dtype=np.int64)}
@@ -313,10 +299,7 @@ def _build_index_mappings(
        or (not os.path.isfile(sample_idx_filename))
        or (not os.path.isfile(shuffle_idx_filename))
    ):
-        print_rank_0(
-            " > WARNING: could not find index map files, building "
-            "the indices on rank 0 ..."
-        )
+        print_rank_0(" > WARNING: could not find index map files, building " "the indices on rank 0 ...")
        # doc-idx.
        start_time = time.time()
        doc_idx = _build_doc_idx(documents, num_epochs, np_rng)
@@ -338,13 +321,9 @@ def _build_index_mappings(
        # As I understand it, num_samples here shares its name with the num_samples argument; it is only used to compute the total length of the indices being built, which decides whether to use int64 or int32
        num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length
        if 2 * (num_samples + 1) < np.iinfo(np.int32).max:
-            sample_idx = helpers.build_sample_idx_int32(
-                sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch
-            )
+            sample_idx = helpers.build_sample_idx_int32(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch)
        else:
-            sample_idx = helpers.build_sample_idx_int64(
-                sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch
-            )
+            sample_idx = helpers.build_sample_idx_int64(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch)
        np.save(sample_idx_filename, sample_idx, allow_pickle=True)
        print_rank_0(
            " > elapsed time to build and save sample-idx mapping "
@@ -360,7 +339,7 @@ def _build_index_mappings(
            " > elapsed time to build and save shuffle-idx mapping"
            " (seconds): {:4f}".format(time.time() - start_time)
        )
-
+
    torch.distributed.barrier()  # TODO: model parallel

    # This should be a barrier but nccl barrier assumes
@@ -370,7 +349,7 @@ def _build_index_mappings(
    # torch.distributed.all_reduce(counts)
    # torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group())
    # assert counts[0].item() == torch.distributed.get_world_size(
-    #     group=mpu.get_io_parallel_group()
+    #     group=mpu.get_io_parallel_group()
    # )

    # Load mappings.
@@ -381,9 +360,7 @@ def _build_index_mappings(
    sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r")
    print_rank_0(" > loading shuffle-idx mapping from {}".format(shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r")
-    print_rank_0(
-        " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time)
-    )
+    print_rank_0(" loaded indexed file in {:3.3f} seconds".format(time.time() - start_time))
    print_rank_0(" total number of samples: {}".format(sample_idx.shape[0]))
    print_rank_0(" total number of epochs: {}".format(num_epochs))

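The `build_sample_idx_int32`/`build_sample_idx_int64` hunk picks the index dtype from an estimate of how many packed samples the token stream yields; if a two-entry-per-sample index could overflow int32, it falls back to int64. A small sketch of just that bound check (the token counts below are made-up, and the helper calls are replaced by a plain dtype choice):

```python
import numpy as np

def sample_idx_dtype(num_epochs: int, tokens_per_epoch: int, seq_length: int):
    # Same estimate as in _build_index_mappings: how many seq_length-sized
    # samples the packed token stream yields across all epochs.
    num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length
    # The sample index keeps roughly two values per sample, hence the factor of 2
    # in the guard against the int32 range.
    if 2 * (num_samples + 1) < np.iinfo(np.int32).max:
        return np.int32
    return np.int64

print(sample_idx_dtype(num_epochs=1, tokens_per_epoch=10_000_000, seq_length=4096))          # int32 is enough
print(sample_idx_dtype(num_epochs=3, tokens_per_epoch=2_000_000_000_000, seq_length=4096))   # needs int64
```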