From 215beeef9f1e380d035647ffa3d01ceabeff2982 Mon Sep 17 00:00:00 2001 From: "Blok, Pieter" Date: Sat, 29 Jan 2022 12:54:35 +0900 Subject: [PATCH] option to do a comparative experiment on the same initial dataset with the same initial model --- MISC_SETTINGS.md | 4 +- README.md | 2 +- .../config/maskAL_experiment_random.yaml | 63 ++++++++++++++ .../config/maskAL_experiment_uncertainty.yaml | 63 ++++++++++++++ active_learning/sampling/prepare_dataset.py | 42 +++++++++- maskAL.py | 83 +++++++++---------- maskAL.yaml | 8 +- types.yaml | 8 +- 8 files changed, 222 insertions(+), 51 deletions(-) create mode 100644 active_learning/config/maskAL_experiment_random.yaml create mode 100644 active_learning/config/maskAL_experiment_uncertainty.yaml diff --git a/MISC_SETTINGS.md b/MISC_SETTINGS.md index 3913974..84d5f77 100644 --- a/MISC_SETTINGS.md +++ b/MISC_SETTINGS.md @@ -2,6 +2,8 @@ The following settings can probably stay unchanged:

| Setting | Description | | --------------------------------------|-----------------------------------------------------------------------------------------------------------------------| +| duplicate_initial_model_and_data | Experimental mode: set this to **True** when you want to duplicate a previously trained model and dataset with a new settings-file. Default **False** | +| initial_train_file | When **duplicate_initial_model_and_data** is set to **True**, specify the txt-file that lists the initial training dataset (for example the **trainfiles_iteration000.txt** of the earlier experiment). | | transfer_learning_on_previous_models | Whether to use the weight-files of the previous trainings for transfer-learning | | warmup_iterations | The number of warmup-iterations that can be used to stabilize the training process | | train_iterations_base | The number of training iterations to start the training with (this number of training iterations is used when the total number of training images is below the value of **step_image_number**) | @@ -19,7 +21,7 @@ The following settings can probably stay unchanged:

| minority_classes | Only when the **"RepeatFactorTrainingSampler"** is used: specify the minority-classes that have to be repeated | | repeat_factor_smallest_class | Only when the **"RepeatFactorTrainingSampler"** is used: specify the repeat-factor of the smallest class (use a value higher than 1.0 to repeat the minority classes) | | experiment_name | Specify the name of your experiment | -| strategies | Use **'uncertainty'** to select the most uncertain images for the active learning. Other options are **'random'** and **'certainty'** | +| strategy | Use **'uncertainty'** to select the most uncertain images for the active learning. Other options are **'random'** and **'certainty'** | | mode | Uncertainty sampling method. Use **'mean'** when you want to sample the most uncertain images, use **'min'** when you want to sample the most uncertain instances | | equal_pool_size | When **True** this will sample the same **pool_size** for every sampling iteration. When **False**, an unequal **pool_size** will be sampled for the specified number of loops | | dropout_probability | Specify the dropout probability between 0.1 and 0.9. Our experiments indicated that **0.25** is a good value | diff --git a/README.md b/README.md index 2169678..888511a 100644 --- a/README.md +++ b/README.md @@ -51,11 +51,11 @@ Change the following settings in the maskAL.yaml file:
| weightsroot | The file directory where the weight-files are stored | | resultsroot | The file directory where the result-files are stored | | dataroot | The root directory where all image-files are stored | -| use_initial_train_dir | Set this to **True** when you want to start the active-learning from an initial training dataset. When **False**, the initial dataset of size **initial_datasize** is randomly sampled from the **traindir** | | initial_train_dir | When use_initial_train_dir is activated: the file directory where the initial training images and annotations are stored | | traindir | The file directory where the training images and annotations are stored | | valdir | The file directory where the validation images and annotations are stored | | testdir | The file directory where the test images and annotations are stored | +| use_initial_train_dir | Set this to **True** when you want to start the active-learning from an initial training dataset. When **False**, the initial dataset of size **initial_datasize** is randomly sampled from the **traindir** | | network_config | The Mask R-CNN configuration-file (.yaml) file (see the folder './configs') | | pretrained_weights | The pretrained weights to start the active-learning. 
Either specify the **network_config** (.yaml) or a custom weights-file (.pth or .pkl)| | cuda_visible_devices | The identifiers of the CUDA device(s) you want to use for training and sampling (in string format, for example: '0,1') | diff --git a/active_learning/config/maskAL_experiment_random.yaml b/active_learning/config/maskAL_experiment_random.yaml new file mode 100644 index 0000000..6362981 --- /dev/null +++ b/active_learning/config/maskAL_experiment_random.yaml @@ -0,0 +1,63 @@ +# Parameters for maskAL +# python maskAL.py --config active_learning/config/maskAL_experiment_random.yaml +# explanation can be found here: https://git.wur.nl/blok012/maskAL + +# folders +weightsroot: "./weights" +resultsroot: "./results" +dataroot: "./datasets/experiment/random" +initial_train_dir: "./datasets" +traindir: "./datasets/train" +valdir: "./datasets/val" +testdir: "./datasets/test03" + +# data options +use_initial_train_dir: False +duplicate_initial_model_and_data: True +initial_train_file: "./results/experiment/uncertainty/trainfiles_iteration000.txt" + +# network parameters +network_config: COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml +pretrained_weights: "./weights/experiment/uncertainty/best_model_000.pth" + +# training-parameters +cuda_visible_devices: '1' +classes: ['healthy', 'damaged', 'matured', 'cateye', 'headrot'] +transfer_learning_on_previous_models: True +learning_rate: 0.005 +warmup_iterations: 500 +train_iterations_base: 2500 +train_iterations_step_size: 2500 +step_image_number: 500 +step_ratios: [0.5, 0.8] +eval_period: 500 +checkpoint_period: -1 +weight_decay: 0.0001 +learning_policy: 'steps_with_decay' +gamma: 0.1 +train_batch_size: 1 +num_workers: 2 + +# train-sampler +train_sampler: "RepeatFactorTrainingSampler" +minority_classes: ['damaged', 'matured', 'cateye', 'headrot'] +repeat_factor_smallest_class: 1.5 + +# evaluation-parameters +confidence_threshold: 0.5 +nms_threshold: 0.01 + +# active-learning sampling +experiment_name: 'experiment' +strategy: 'random' +mode: 'mean' 
+initial_datasize: 100 +pool_size: 200 +equal_pool_size: True +loops: 5 +dropout_probability: 0.25 +mcd_iterations: 20 +iou_thres: 0.5 +auto_annotate: False +export_format: labelme +supervisely_meta_json: "./datasets/meta.json" diff --git a/active_learning/config/maskAL_experiment_uncertainty.yaml b/active_learning/config/maskAL_experiment_uncertainty.yaml new file mode 100644 index 0000000..ce195ff --- /dev/null +++ b/active_learning/config/maskAL_experiment_uncertainty.yaml @@ -0,0 +1,63 @@ +# Parameters for maskAL +# python maskAL.py --config active_learning/config/maskAL_experiment_uncertainty.yaml +# explanation can be found here: https://git.wur.nl/blok012/maskAL + +# folders +weightsroot: "./weights" +resultsroot: "./results" +dataroot: "./datasets/experiment/uncertainty" +initial_train_dir: "./datasets" +traindir: "./datasets/train" +valdir: "./datasets/val" +testdir: "./datasets/test03" + +# data options +use_initial_train_dir: False +duplicate_initial_model_and_data: False +initial_train_file: "" + +# network parameters +network_config: COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml +pretrained_weights: COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml + +# training-parameters +cuda_visible_devices: '0' +classes: ['healthy', 'damaged', 'matured', 'cateye', 'headrot'] +transfer_learning_on_previous_models: True +learning_rate: 0.005 +warmup_iterations: 500 +train_iterations_base: 2500 +train_iterations_step_size: 2500 +step_image_number: 500 +step_ratios: [0.5, 0.8] +eval_period: 500 +checkpoint_period: -1 +weight_decay: 0.0001 +learning_policy: 'steps_with_decay' +gamma: 0.1 +train_batch_size: 1 +num_workers: 2 + +# train-sampler +train_sampler: "RepeatFactorTrainingSampler" +minority_classes: ['damaged', 'matured', 'cateye', 'headrot'] +repeat_factor_smallest_class: 1.5 + +# evaluation-parameters +confidence_threshold: 0.5 +nms_threshold: 0.01 + +# active-learning sampling +experiment_name: 'experiment' +strategy: 'uncertainty' +mode: 'mean' +initial_datasize: 100 +pool_size: 
200 +equal_pool_size: True +loops: 5 +dropout_probability: 0.25 +mcd_iterations: 20 +iou_thres: 0.5 +auto_annotate: False +export_format: labelme +supervisely_meta_json: "./datasets/meta.json" diff --git a/active_learning/sampling/prepare_dataset.py b/active_learning/sampling/prepare_dataset.py index cf99a45..d9d039b 100644 --- a/active_learning/sampling/prepare_dataset.py +++ b/active_learning/sampling/prepare_dataset.py @@ -1,7 +1,7 @@ # @Author: Pieter Blok # @Date: 2021-03-26 14:30:31 # @Last Modified by: Pieter Blok -# @Last Modified time: 2021-12-09 13:55:46 +# @Last Modified time: 2022-01-27 15:00:20 import sys import io @@ -1394,6 +1394,20 @@ def calculate_iterations(config, dataset_dicts_train): max_iterations = config['train_iterations_base'] + ((div_factor - 1) * config['train_iterations_step_size']) steps = [int(s * max_iterations) for s in config['step_ratios']] return int(max_iterations), steps + + +def read_train_file(txt_file): + initial_train_files = [] + txtfile = open(txt_file, 'r') + lines = txtfile.readlines() + + for line in lines: + if line != "\n": + full_line = line.strip() + train_file = full_line.split(",")[0] + initial_train_files.append(train_file) + txtfile.close() + return initial_train_files def prepare_all_dataset_randomly(rootdir, imgdir, classes, train_val_test_split, initial_datasize): @@ -1477,7 +1491,31 @@ def prepare_initial_dataset(config): except Exception as e: print_exception(e) logger.error("Cannot create initial-datasets") - sys.exit("Closing application") + sys.exit("Closing application") + + +def prepare_initial_dataset_from_list(config, train_list): + try: + for i, (imgdir, name) in enumerate(zip([config['traindir'], config['valdir'], config['testdir']], ['train', 'val', 'test'])): + print("") + print("Processing {:s}-dataset: {:s}".format(name, imgdir)) + rename_xml_files(imgdir) + images, annotations = list_files(imgdir) + print("{:d} images found!".format(len(images))) + print("{:d} annotations 
found!".format(len(annotations))) + write_file(config['dataroot'], images, name) + + if name == "train": + check_json_presence(config, imgdir, train_list, name) + create_json(config['dataroot'], imgdir, train_list, config['classes'], name) + else: + check_json_presence(config, imgdir, images, name) + create_json(config['dataroot'], imgdir, images, config['classes'], name) + + except Exception as e: + print_exception(e) + logger.error("Cannot create initial-datasets") + sys.exit("Closing application") def update_train_dataset(config, cfg, train_list): diff --git a/maskAL.py b/maskAL.py index 6b517d6..dfe8c78 100644 --- a/maskAL.py +++ b/maskAL.py @@ -1,7 +1,7 @@ # @Author: Pieter Blok # @Date: 2021-03-25 18:48:22 # @Last Modified by: Pieter Blok -# @Last Modified time: 2021-11-22 16:11:35 +# @Last Modified time: 2022-01-27 18:00:39 ## Active learning with Mask R-CNN @@ -45,7 +45,7 @@ ## libraries that are specific for dropout training from active_learning.strategies.dropout import FastRCNNConvFCHeadDropout, FastRCNNOutputLayersDropout, MaskRCNNConvUpsampleHeadDropout, Res5ROIHeadsDropout -from active_learning.sampling import observations, prepare_initial_dataset, prepare_initial_dataset_randomly, update_train_dataset, prepare_complete_dataset, calculate_repeat_threshold, calculate_iterations +from active_learning.sampling import observations, prepare_initial_dataset, prepare_initial_dataset_from_list, prepare_initial_dataset_randomly, update_train_dataset, prepare_complete_dataset, calculate_repeat_threshold, calculate_iterations, read_train_file from active_learning.sampling.montecarlo_dropout import MonteCarloDropout, MonteCarloDropoutHead from active_learning.heuristics import uncertainty @@ -105,7 +105,7 @@ def check_pretrained_weights(field, value, error): schema[key] = {'type': value1, 'allowed': ['steps_with_lrs', 'steps_with_decay', 'step', 'cosine_decay', 'exp_decay']} elif key == "train_sampler": schema[key] = {'type': value1, 'allowed': 
['TrainingSampler', 'RepeatFactorTrainingSampler']} - elif key == "strategies": + elif key == "strategy": schema[key] = {'type': value1, 'allowed': ['uncertainty', 'certainty', 'random']} elif key == "mode": schema[key] = {'type': value1, 'allowed': ['mean', 'min']} @@ -132,7 +132,7 @@ def check_pretrained_weights(field, value, error): if isinstance(value1, list): if not all(isinstance(v, known_types[value1[0]]) for v in value): error_list.update({key: ["not all items are of type: " + str(value1[0])]}) - if key == "strategies": + if key == "strategy": if not all(v in ['uncertainty', 'certainty', 'random'] for v in value): error_list.update({key: ["choose 1 of these 3 options: 'uncertainty', 'certainty', 'random'"]}) if key == "mode": @@ -187,12 +187,12 @@ def init_folders_and_files(config): resultsfolders = [] csv_names = [] - counts = Counter(config['strategies']) + counts = Counter(config['strategy']) counts = list(counts.values()) duplicates = any(x > 1 for x in counts) hybrid_count = 0 - for s, (strategy, mode, dropout_probability, mcd_iterations, pool_size) in enumerate(zip(config['strategies'], config['mode'], config['dropout_probability'], config['mcd_iterations'], config['pool_size'])): + for strategy, mode, dropout_probability, mcd_iterations, pool_size in zip(config['strategy'], config['mode'], config['dropout_probability'], config['mcd_iterations'], config['pool_size']): if duplicates: if isinstance(pool_size, list): hybrid_count += 1 @@ -226,25 +226,15 @@ def remove_initial_training_set(dataroot): os.remove(os.path.join(dataroot, "initial_train.txt")) -def store_initial_files(cfg, config, dataset_dicts_train_init, val_value_init, weightsfolders): - for wf in range(len(weightsfolders)): - with open(os.path.join(weightsfolders[wf], "cfg_init.yaml"), "w") as f1: - f1.write(cfg.dump()) - with open(os.path.join(weightsfolders[wf], 'val_value_init.pkl'), 'wb') as f2: - pickle.dump(val_value_init, f2) +def store_initial_val_value(val_value_init, 
weightsfolder): + with open(os.path.join(weightsfolder, 'val_value_init.pkl'), 'wb') as f1: + pickle.dump(val_value_init, f1) - with open(os.path.join(config['dataroot'], 'dataset_dicts_train_init.pkl'), 'wb') as f3: - pickle.dump(dataset_dicts_train_init, f3) - -def load_initial_files(config, weightsfolders): - cfg = get_cfg() - cfg.merge_from_file(os.path.join(weightsfolders[0], "cfg_init.yaml")) - with open(os.path.join(weightsfolders[0], 'val_value_init.pkl'), 'rb') as f1: +def load_initial_val_value(weightsfolder): + with open(os.path.join(weightsfolder, 'val_value_init.pkl'), 'rb') as f1: val_value_init = pickle.load(f1) - with open(os.path.join(config['dataroot'], 'dataset_dicts_train_init.pkl'), 'rb') as f2: - dataset_dicts_train_init = pickle.load(f2) - return cfg, dataset_dicts_train_init, val_value_init + return val_value_init def calculate_max_entropy(classes): @@ -309,13 +299,11 @@ def move_initial_train_dir(initial_train_dir, traindir, export): copyfile(os.path.join(initial_train_dir, cur_file), os.path.join(traindir, cur_file)) -def copy_initial_weight_file(read_folder, weightsfolders, iter): +def copy_initial_weight_file(read_folder, weightsfolder, iter): weight_file = "best_model_{:s}.pth".format(str(iter).zfill(3)) - for wf in range(1, len(weightsfolders)): - write_folder = weightsfolders[wf] - check_direxcist(write_folder) - if os.path.exists(os.path.join(read_folder, weight_file)): - copyfile(os.path.join(read_folder, weight_file), os.path.join(write_folder, weight_file)) + check_direxcist(weightsfolder) + if os.path.exists(os.path.join(read_folder, weight_file)): + copyfile(os.path.join(read_folder, weight_file), os.path.join(weightsfolder, weight_file)) def copy_previous_weights(weights_folder, iteration): @@ -326,7 +314,7 @@ def copy_previous_weights(weights_folder, iteration): copyfile(previous_weights_file, next_weights_file) -def Train_MaskRCNN(config, weightsfolder, gpu_num, iter, val_value, dropout_probability, init): +def 
Train_MaskRCNN(config, weightsfolder, gpu_num, iter, val_value, dropout_probability, init=False, skip_training=False): ## Hook to automatically save the best checkpoint class BestCheckpointer(HookBase): def __init__(self, iter, eval_period, val_value, metric): @@ -459,9 +447,11 @@ def build_hooks(self): cfg.TEST.EVAL_PERIOD = config['eval_period'] cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(config['classes']) os.makedirs(cfg.OUTPUT_DIR, exist_ok=True) - trainer = CustomTrainer(cfg) - trainer.resume_or_load(resume=False) - trainer.train() + + if not skip_training: + trainer = CustomTrainer(cfg) + trainer.resume_or_load(resume=False) + trainer.train() try: val_value_output = trainer.storage._latest_scalars['highest_value'][0] @@ -471,7 +461,7 @@ def build_hooks(self): return cfg, dataset_dicts_train, val_value_output -def Eval_MaskRCNN(cfg, config, dataset_dicts_train, weightsfolder, resultsfolder, csv_name, iter, init): +def Eval_MaskRCNN(cfg, config, dataset_dicts_train, weightsfolder, resultsfolder, csv_name, iter, init=False): if init: register_coco_instances("test", {}, os.path.join(config['dataroot'], "test.json"), config['testdir']) test_metadata = MetadataCatalog.get("test") @@ -621,7 +611,7 @@ def random_pooling(pool_list, pool_size, cfg, config, max_entropy, mcd_iteration if not config_ok: sys.exit("Closing application") - config = process_config_file(config, ['strategies', 'mode', 'equal_pool_size', 'pool_size', 'dropout_probability', 'mcd_iterations', 'loops']) + config = process_config_file(config, ['strategy', 'mode', 'equal_pool_size', 'pool_size', 'dropout_probability', 'mcd_iterations', 'loops']) os.environ["CUDA_VISIBLE_DEVICES"] = config['cuda_visible_devices'] gpu_num = len(config['cuda_visible_devices']) check_direxcist(config['dataroot']) @@ -633,20 +623,27 @@ def random_pooling(pool_list, pool_size, cfg, config, max_entropy, mcd_iteration move_initial_train_dir(config['initial_train_dir'], config['traindir'], "images") prepare_initial_dataset(config) 
move_initial_train_dir(config['initial_train_dir'], config['traindir'], "annotations") + elif config['duplicate_initial_model_and_data']: + initial_train_files = read_train_file(config['initial_train_file']) + prepare_initial_dataset_from_list(config, initial_train_files) else: prepare_initial_dataset_randomly(config) ## active-learning - for i, (strategy, equal_pool_size, pool_size, mcd_iterations, mode, dropout_probability, loops, weightsfolder, resultsfolder, csv_name) in enumerate(zip(config['strategies'], config['equal_pool_size'], config['pool_size'], config['mcd_iterations'], config['mode'], config['dropout_probability'], config['loops'], weightsfolders, resultsfolders, csv_names)): - ## train and evaluate Mask R-CNN on the initial dataset - if i == 0: - cfg, dataset_dicts_train, val_value = Train_MaskRCNN(config, weightsfolder, gpu_num, 0, 0, dropout_probability, init=True) - cfg = Eval_MaskRCNN(cfg, config, dataset_dicts_train, weightsfolder, resultsfolder, csv_name, 0, init=True) + for strategy, equal_pool_size, pool_size, mcd_iterations, mode, dropout_probability, loops, weightsfolder, resultsfolder, csv_name in zip(config['strategy'], config['equal_pool_size'], config['pool_size'], config['mcd_iterations'], config['mode'], config['dropout_probability'], config['loops'], weightsfolders, resultsfolders, csv_names): + ## duplicate the initial model, when comparing the uncertainty sampling with the random sampling + if config['duplicate_initial_model_and_data']: + duplicated_weightsfolder = os.path.dirname(config['pretrained_weights']) + copy_initial_weight_file(duplicated_weightsfolder, weightsfolder, 0) + val_value_init = load_initial_val_value(duplicated_weightsfolder) + cfg, dataset_dicts_train, val_value = Train_MaskRCNN(config, weightsfolder, gpu_num, 1, val_value_init, dropout_probability, init=True, skip_training=True) + + ## train and evaluate Mask R-CNN on the randomly sampled initial dataset else: - initial_train_names = 
get_initial_train_names(config) - update_train_dataset(config, cfg, initial_train_names) - cfg, dataset_dicts_train, val_value = Train_MaskRCNN(config, weightsfolder, gpu_num, 0, 0, dropout_probability, init=False) - cfg = Eval_MaskRCNN(cfg, config, dataset_dicts_train, weightsfolder, resultsfolder, csv_name, 0, init=False) + cfg, dataset_dicts_train, val_value = Train_MaskRCNN(config, weightsfolder, gpu_num, 0, 0, dropout_probability, init=True) + store_initial_val_value(val_value, weightsfolder) + + cfg = Eval_MaskRCNN(cfg, config, dataset_dicts_train, weightsfolder, resultsfolder, csv_name, 0, init=True) train_names = get_train_names(dataset_dicts_train, config['traindir']) write_train_files(train_names, resultsfolder, 0) diff --git a/maskAL.yaml b/maskAL.yaml index c6d0f19..ca82bbb 100644 --- a/maskAL.yaml +++ b/maskAL.yaml @@ -6,12 +6,16 @@ weightsroot: "./weights" resultsroot: "./results" dataroot: "./datasets" -use_initial_train_dir: False initial_train_dir: "./datasets/initial_train" traindir: "./datasets/train" valdir: "./datasets/val" testdir: "./datasets/test" +# data options +use_initial_train_dir: False +duplicate_initial_model_and_data: False +initial_train_file: "" + # network parameters network_config: COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml pretrained_weights: COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml @@ -45,7 +49,7 @@ nms_threshold: 0.2 # active-learning sampling experiment_name: 'exp1' -strategies: 'uncertainty' +strategy: 'uncertainty' mode: 'mean' initial_datasize: 100 pool_size: 200 diff --git a/types.yaml b/types.yaml index 1bd1d43..102e504 100644 --- a/types.yaml +++ b/types.yaml @@ -4,12 +4,16 @@ weightsroot: 'string' resultsroot: 'string' dataroot: 'string' -use_initial_train_dir: 'boolean' initial_train_dir: 'string' traindir: 'string' valdir: 'string' testdir: 'string' +# data options +use_initial_train_dir: 'boolean' +duplicate_initial_model_and_data: 'boolean' +initial_train_file: 'string' + # 
network parameters network_config: 'string' pretrained_weights: 'string' @@ -43,7 +47,7 @@ nms_threshold: 'float' # active-learning sampling experiment_name: 'string' -strategies: ['string', 'list'] +strategy: ['string', 'list'] mode: ['string', 'list'] initial_datasize: 'integer' pool_size: ['integer', 'list']