Definition of a U net model using pytorch #1328

Open · wants to merge 6 commits into main
105 changes: 105 additions & 0 deletions u_net/data_set.py
@@ -0,0 +1,105 @@
"""
This file contains the class DataSetImgToImg, which is a PyTorch Dataset for loading images and their corresponding masks.
The class is designed to work with image-to-image translation tasks, such as segmentation or style transfer.
"""
import os
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms


identity_transform = transforms.Compose([
    #* ToPILImage converts a PyTorch tensor or a NumPy ndarray to a PIL (Python Imaging Library) image.
    transforms.ToPILImage(),
    #* ToTensor converts a NumPy ndarray or PIL image to a tensor.
    transforms.ToTensor()
])


class DataSetImgToImg(Dataset):
    '''
    A dataset for image-to-image models. It needs the paths of the model's
    input and output images, i.e. the path of the image folder and the
    path of the mask folder.

    Attributes
    ----------
    data : list[tuple]
        The list of tuples pairing each input image name with its mask, e.g.
        [(in_put_img.1.jpg, out_put_img1.1.jpg)].
    root_data : list[str, str]
        root_data[0] is the path to the folder with the input images.
        root_data[1] is the path to the folder with the output images.
    trans_for_in_put_img : torchvision.transforms.Compose, optional
        Transformation applied to the input images.
    trans_for_out_put_img : torchvision.transforms.Compose, optional
        Transformation applied to the output images.
    test : bool
        If True, the dataset is truncated to 'data_size' elements for testing.

    Methods
    -------
    __getitem__(index):
        Fetches and transforms the input and output images at the specified index.

    __len__() -> int:
        Returns the size of the dataset.
    '''

    def __init__(
        self,
        root_data,
        trans_for_in_img = identity_transform,
        trans_for_out_img = identity_transform,
        test = False,
        data_size = 100
    ):
        super(DataSetImgToImg, self).__init__()

        self.data = []
        self.root_data = root_data
        self.trans_for_in_put_img = trans_for_in_img
        self.trans_for_out_put_img = trans_for_out_img
        self.test = test

        #* Create sorted lists of the file names in each root_data folder,
        #* so that matching input and mask names end up at the same index.
        #TODO: if the file names differ between the folders, the pairing is wrong.
        in_put_images = sorted(os.listdir(self.root_data[0]))
        out_put_images = sorted(os.listdir(self.root_data[1]))

        if len(in_put_images) != len(out_put_images):
            print("Warning: len(in_put_images) != len(out_put_images)")

        if self.test:
            data_size = min(len(in_put_images), data_size)
            in_put_images = in_put_images[0:data_size]
            out_put_images = out_put_images[0:data_size]

        #* Save a list of tuples like [(in_put_img.1.jpg, out_put_img1.1.jpg)]
        self.data = list(zip(in_put_images, out_put_images))
        print("Dataset size:", len(in_put_images))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        #* Read the images, convert them to arrays, and keep only the first 3 channels.
        in_put_img_file = self.data[index][0]
        in_put_img_pth = os.path.join(self.root_data[0], in_put_img_file)
        in_put_img = np.array(Image.open(in_put_img_pth))
        in_put_img = in_put_img[:, :, :3]

        out_put_img_file = self.data[index][1]
        out_put_img_pth = os.path.join(self.root_data[1], out_put_img_file)
        out_put_img = np.array(Image.open(out_put_img_pth))

        #* The mask is kept as a single-channel 2-D array.
        out_put_img = out_put_img[:, :]

        #* Apply the corresponding transformations. These could be data augmentation functions.
        in_put_img = self.trans_for_in_put_img(in_put_img)
        out_put_img = self.trans_for_out_put_img(out_put_img)

        return in_put_img, out_put_img
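
For reference, a minimal usage sketch for this Dataset (not part of the PR; the folder paths and batch size are placeholders, and it assumes all images share one resolution, as in the Brain MRI set):

# Hypothetical usage example; paths and batch size are illustrative only.
from torch.utils.data import DataLoader

dataset = DataSetImgToImg(root_data = ["data/train/img", "data/train/mask"])
loader = DataLoader(dataset, batch_size = 8, shuffle = True)

for in_put_img, out_put_img in loader:
    #* in_put_img: (8, 3, H, W) float tensor; out_put_img: (8, 1, H, W) mask tensor.
    break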

154 changes: 154 additions & 0 deletions u_net/dataset_preparation.py
@@ -0,0 +1,154 @@
'''
This script prepares the Brain MRI segmentation dataset for training an
image-to-image model. It creates the following folders within
'new_folders_path': train/img, train/mask, validation/img, and
validation/mask. After that, it copies the original images into train/img
and train/mask. When copying the mask images, "_mask" is removed from their
filenames so that they match the image filenames (this is necessary for use
with the DataSetImgToImg class). Subsequently, the data is split into 80%
for training and 20% for validation. The function
'preparation_brain_MRI_set' is the main entry point: it takes the dataset
path and the new folder path as input arguments, creates the necessary
folders, and copies the images into them. The function 'split_data_set'
splits the dataset into training and validation sets. The function
'copy_in_new_file' copies an image from one folder to another and removes
the old file if requested.
'''

import os
import random


def copy_in_new_file(
    old_path : str,
    new_path : str,
    img_name : str,
    delete_old_file : bool = False,
) -> None:
    '''
    Copies the image from old_path/img_name to new_path/img_name2, where
    img_name2 is img_name without the "_mask" substring. If
    delete_old_file is True, the old image is removed.

    Args
    ----
    old_path : str
        The folder that contains the image.
    new_path : str
        The folder where the copy of the image will be written.
    img_name : str
        The image name.
    delete_old_file : bool = False
        True if we want to delete the old file.
    '''

    old_file_path = old_path + '/' + img_name
    with open(old_file_path, "rb") as f:
        img_copy = f.read()

    if "mask" in img_name:
        img_name = img_name.replace("_mask", "")

    with open(new_path + '/' + img_name, "wb") as f:
        f.write(img_copy)

    if delete_old_file:
        #TODO test
        os.remove(old_file_path)
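
#* Example of the renaming behavior (hypothetical file names): copying
#* "TCGA_CS_4941_1_mask.tif" from "raw/case_1" to "prepared/train/mask"
#* writes it as "prepared/train/mask/TCGA_CS_4941_1.tif":
#*     copy_in_new_file("raw/case_1", "prepared/train/mask", "TCGA_CS_4941_1_mask.tif")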


def split_data_set(new_folders_path : str) -> None:
    '''
    This function splits the dataset into train and validation folders.
    The validation set will be 20% of the original dataset. We need train
    and validation folders in new_folders_path, each with img and mask
    folders inside, and 100% of the dataset in train. Like this:

    new_folders_path/
    |
    |--train/
    |   |
    |   |- img
    |   |
    |   L mask
    |
    |--validation
    |   |
    |   |- img
    |   |
    |   L mask

    Args:
    -----
    new_folders_path : str
        The folder that contains the dataset, with a train folder (img and
        mask subfolders) and a validation folder (img and mask subfolders),
        as in the diagram.
    '''

    files_img = os.listdir(new_folders_path + "/train/img")
    files_mask = os.listdir(new_folders_path + "/train/mask")
    data_set_size = len(files_img) #* number of pairs (img, mask) in the dataset
    validation_size = data_set_size//5 #TODO: add a percentage argument.

    for _ in range(validation_size):

        img_name = random.choice(files_img)
        old_path = new_folders_path + "/train/img"
        new_path = new_folders_path + "/validation/img"
        #* delete_old_file=True moves (rather than duplicates) the pair into validation.
        copy_in_new_file(old_path = old_path, new_path = new_path, img_name = img_name, delete_old_file = True)
        files_img.remove(img_name)

        old_path = new_folders_path + "/train/mask"
        new_path = new_folders_path + "/validation/mask"
        copy_in_new_file(old_path = old_path, new_path = new_path, img_name = img_name, delete_old_file = True)
        files_mask.remove(img_name)

    print("validation_size = ", validation_size)
    print("train_size = ", len(files_img))


def preparation_brain_MRI_set(
    path_data_set : str,
    new_folders_path : str,
) -> None:
    '''
    This function prepares the Brain MRI segmentation dataset in the
    specified 'new_folders_path' for training an image-to-image model. It
    creates the following folders within 'new_folders_path': train/img,
    train/mask, validation/img, and validation/mask. After that, the
    function copies the original images into train/img and train/mask.
    When copying the mask images, "_mask" is removed from their filenames
    so that they match the image filenames (this is necessary for use with
    the DataSetImgToImg class). Subsequently, the data is split into 80%
    for training and 20% for validation.

    Args:
    -----
    path_data_set : str
        The folder of the Brain MRI dataset.
    new_folders_path : str
        The folder where we will create the folders train/img, train/mask,
        validation/img, and validation/mask, then move the dataset into
        the train folder, and finally split the dataset between the train
        and validation folders.
    '''

    folder_kaggle_3m = os.listdir(path_data_set)
    path_data_set += '/' + folder_kaggle_3m[0]
    #* Keep only the patient folders, ignoring the README and csv files.
    folders_list = [f for f in os.listdir(path_data_set)
                    if os.path.isdir(path_data_set + '/' + f)]

    #* Create the folders where the images will be saved.
    os.makedirs(new_folders_path + "/train/img" , exist_ok=True)
    os.makedirs(new_folders_path + "/train/mask", exist_ok=True)
    os.makedirs(new_folders_path + "/validation/img" , exist_ok=True)
    os.makedirs(new_folders_path + "/validation/mask", exist_ok=True)

    data_set_size = 0
    #* Open every image in the dataset and separate images from masks.
    for folder in folders_list:
        img_folder_list = os.listdir(path_data_set + '/' + folder)
        for img_name in img_folder_list:
            data_set_size += 1
            if "mask" in img_name:
                copy_in_new_file(path_data_set + '/' + folder, new_folders_path + "/train/mask", img_name)
            else:
                copy_in_new_file(path_data_set + '/' + folder, new_folders_path + "/train/img", img_name)

    split_data_set(new_folders_path)
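
A hypothetical entry point tying the two steps together (the paths are placeholders for wherever the Kaggle Brain MRI archive was extracted and for the prepared output folder):

if __name__ == "__main__":
    #* Assumed locations; adjust to the actual download and output paths.
    preparation_brain_MRI_set(
        path_data_set = "lgg-mri-segmentation",
        new_folders_path = "brain_mri_prepared",
    )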

