[ENH] Correct which data is loaded and missing

compneurobilbao · Nov 23, 2023 · 71d0b01 · 71d0b01
1 parent 2065069
commit 71d0b01
Show file tree

Hide file tree

Showing 2 changed files with 133 additions and 58 deletions.
diff --git a/src/ageml/messages.py b/src/ageml/messages.py
@@ -22,7 +22,7 @@
 output_long_description = "Path to output directory where to save results. (Required)"
 
 features_long_description = (
-    "Path to input CSV file containing features. (Required) \n"
+    "Path to input CSV file containing features. (Required: run age) \n"
     "In the file the first column should be the ID, the second column should be the AGE, \n"
     "and the following columns the features. The first row should be the header for \n"
     "column names."
@@ -37,15 +37,15 @@
 
 scaler_long_description = (
     "Scaler type and scaler parameters to use. First argument is the type and the following \n"
-    "arguments are input as keyword arguments into the scaler. They must be seperated by an =.\n"
+    "arguments are input as keyword arguments into scaler. They must be seperated by an =.\n"
     "Example: -m standard\n"
     "Available Types: standard (Default: standard)"
 )
 
 cv_long_description = (
-    "Number of CV splits with which to run the Cross Validation Scheme. Expect 1 or 2 integers. \n"
-    "First integer is the number of splits and the second is the seed for randomization. \n"
-    "Default: 5 0"
+    "Number of CV splits with which to run the Cross Validation Scheme. Expect 1 or 2 \n"
+    "integers. First integer is the number of splits and the second is the seed for \n"
+    "randomization. Default: 5 0"
 )
 
 covar_long_description = (
@@ -55,13 +55,13 @@
 )
 
 factors_long_description = (
-    "Path to input CSV file containing factors (e.g. liefstyle and environmental factors). \n"
+    "Path to input CSV file containing factors (Required: run lifestyle). \n"
     "In the file the first column should be the ID, the followins columns should be the \n"
     "factors. The first row should be the header for column names."
 )
 
 clinical_long_description = (
-    "Path to input CSV file containing health conditions. \n"
+    "Path to input CSV file containing conditions (Required: run clinical or classification).\n"
     "In the file, the first column should be the ID, the second column should be whether the \n"
     "subject is a CONTROL, and the following columns are binary variables for different \n"
     "conditions. The first row should be the header for column names."
@@ -75,6 +75,12 @@
     "(e.g. Brain Structure: White Matter Volume, Grey Matter Volume, VCSF Volume)"
 )
 
+ages_long_description = (
+    "Path to input CSV file containing the ages of the subjects. \n"
+    "In the file the first column should be the ID, the second column should be the age, \n"
+    "the third column should be the predicted age, fourth age is corrected age and last \n"
+    "column is the delta. The first row should be the header for column names."
+)
 # UI information
 
 emblem = """

diff --git a/src/ageml/ui.py b/src/ageml/ui.py
@@ -49,7 +49,7 @@ class Interface:
 
     load_csv(self, file): Use panda to load csv into dataframe.
 
-    load_data(self): Load data from csv files.
+    load_data(self, required): Load data from csv files.
 
     age_distribution(self, dfs, labels=None): Use visualizer to show age distribution.
 
@@ -138,13 +138,25 @@ def load_csv(self, file):
         else:
             return None
 
-    def load_data(self):
-        """Load data from csv files."""
+    def load_data(self, required=[]):
+        """Load data from csv files.
+        
+        Parameters
+        ----------
+        required: list of required files"""
+
+        # Load files
+        print('-----------------------------------')
+        print('Loading data...')
+
 
         # Load features
         self.df_features = self.load_csv(self.args.features)
-        if 'age' not in self.df_features.columns:
-            raise KeyError("Features file must contain a column name 'age', or any other case-insensitive variation.")
+        if self.df_features is not None:
+            if 'age' not in self.df_features.columns:
+                raise KeyError("Features file must contain a column name 'age', or any other case-insensitive variation.")
+        elif 'features' in required:
+            raise ValueError("Features file must be provided.")
 
         # Load covariates
         self.df_covariates = self.load_csv(self.args.covariates)
@@ -160,15 +172,64 @@ def load_data(self):
             else:
                 self.flags['CN'] = True
                 self.cn_subjects = self.df_clinical[self.df_clinical['cn']].index
-
-        # Remove subjects with missing features
-        self.subjects_missing_data = self.df_features[self.df_features.isnull().any(axis=1)].index.to_list()
+        elif 'clinical' in required:
+            raise ValueError("Clinical file must be provided.")
+
+        # Load ages
+        # Check if already has ages loaded 
+        if hasattr(self, 'df_ages'):
+            if self.df_ages is None:
+                self.df_ages = self.load_csv(self.args.ages)
+            else:
+                # Dont over write if None
+                df = self.load_csv(self.args.ages)
+                if df is not None:
+                    self.df_ages = df
+                    warning_message = "Ages file already loaded, overwriting with  %s provided file." \
+                                      % self.args.ages
+                    print(warning_message)
+                    warnings.warn(warning_message, category=UserWarning)
+        else:
+            self.df_ages = self.load_csv(self.args.ages)
+
+        # Check that ages file has required columns
+        if self.df_ages is not None:
+            cols = ['age', 'predicted age', 'corrected age', 'delta']
+            for col in cols:
+                if col not in self.df_ages.columns:
+                    raise KeyError("Clinical file must contian a column name %s" % col)
+
+        # Remove subjects with missing values
+        dfs = [self.df_features, self.df_covariates, self.df_factors, self.df_clinical, self.df_ages]
+        self.subjects_missing_data = []
+        for df in dfs:
+            if df is not None:    
+                self.subjects_missing_data = self.subjects_missing_data + df[df.isnull().any(axis=1)].index.to_list()
         if self.subjects_missing_data.__len__() != 0:
-            print('-----------------------------------')
             warn_message = 'Subjects with missing data: %s' % self.subjects_missing_data
             print(warn_message)
             warnings.warn(warn_message, category=UserWarning)
-        self.df_features.dropna(inplace=True)
+
+        # Check that all dataframes have the same subjects
+        non_shared_subjects = [] 
+        for i in range(len(dfs)):
+            for j in range(i+1, len(dfs)):
+                if dfs[i] is not None and dfs[j] is not None:
+                    # Find subjects in one dataframe but not the other
+                    non_shared_subjects = non_shared_subjects + \
+                                          [s for s in dfs[i].index.to_list() 
+                                           if s not in dfs[j].index.to_list()]
+        if non_shared_subjects.__len__() != 0:
+            warn_message = 'Subjects not shared between dataframes: %s' % non_shared_subjects
+            print(warn_message)
+            warnings.warn(warn_message, category=UserWarning)
+            self.subjects_missing_data = self.subjects_missing_data + non_shared_subjects
+
+        # Remove subjects with missing values
+        for df in dfs:
+            if df is not None:                
+                df.drop(self.subjects_missing_data, inplace=True, errors='ignore')            
+
 
     def age_distribution(self, dfs, labels=None, name=''):
         """Use visualizer to show age distribution.
@@ -249,10 +310,10 @@ def model_age(self, df, model):
 
         # Save to dataframe and csv
         data = np.stack((y, y_pred, y_corrected, deltas), axis=1)
-        cols = ['Age', 'Predicted Age', 'Corrected Age', 'Delta']
-        df_age = pd.DataFrame(data, index=df.index, columns=cols)
+        cols = ['age', 'predicted age', 'corrected age', 'delta']
+        df_ages = pd.DataFrame(data, index=df.index, columns=cols)
 
-        return model, df_age
+        return model, df_ages
 
     def predict_age(self, df, model):
         """Use AgeML to predict age with data."""
@@ -273,10 +334,10 @@ def predict_age(self, df, model):
 
         # Save to dataframe and csv
         data = np.stack((y, y_pred, y_corrected, deltas), axis=1)
-        cols = ['Age', 'Predicted Age', 'Corrected Age', 'Delta']
-        df_age = pd.DataFrame(data, index=df.index, columns=cols)
+        cols = ['age', 'predicted age', 'corrected age', 'delta']
+        df_ages = pd.DataFrame(data, index=df.index, columns=cols)
 
-        return df_age
+        return df_ages
 
     def deltas_by_group(self, df, labels):
 
@@ -287,7 +348,7 @@ def deltas_by_group(self, df, labels):
         # Obtain deltas means and stds
         deltas = []
         for i, df_group in enumerate(df):
-            deltas.append(df_group['Delta'].to_numpy())
+            deltas.append(df_group['delta'].to_numpy())
             print(labels[i])
             print('Mean delta: %.2f' % np.mean(deltas[i]))
             print('Std delta: %.2f' % np.std(deltas[i]))
@@ -309,14 +370,18 @@ def deltas_by_group(self, df, labels):
         self.visualizer.deltas_by_groups(deltas, labels)
 
     @log
+    def run_wrapper(self, run):
+        """Wrapper for running modelling with log."""
+        run()
+
     def run_age(self):
         """Run basic age modelling."""
 
         # Run age modelling
         print('Running age modelling...')
 
         # Load data
-        self.load_data()
+        self.load_data(required=['features'])
 
         # Select controls
         if self.flags['CN']:
@@ -331,54 +396,53 @@ def run_age(self):
         self.features_vs_age(df_cn)
 
         # Model age
-        self.ageml, df_age_cn = self.model_age(df_cn, self.ageml)
+        self.ageml, df_ages_cn = self.model_age(df_cn, self.ageml)
 
         # Apply to clinical data
         if self.flags['CN']:
             df_clinical = self.df_features.loc[~self.df_features.index.isin(self.cn_subjects)]
-            df_age_clinical = self.predict_age(df_clinical, self.ageml)
-            self.df_age = pd.concat([df_age_cn, df_age_clinical])
+            df_ages_clinical = self.predict_age(df_clinical, self.ageml)
+            self.df_ages = pd.concat([df_ages_cn, df_ages_clinical])
         else:
-            self.df_age = df_age_cn
+            self.df_ages = df_ages_cn
 
         # Save dataframe
-        self.df_age.to_csv(os.path.join(self.dir_path, 'predicted_age.csv'))
+        self.df_ages.to_csv(os.path.join(self.dir_path, 'predicted_age.csv'))
 
-    @log
     def run_lifestyle(self):
         """Run age modelling with lifestyle factors."""
 
         print('Running lifestyle factors...')
         pass
 
-    @log
     def run_clinical(self):
         """Run age modelling with clinical factors."""
 
         print('Running clinical outcomes...')
+
+        # Load data
+        self.load_data(required=['clinical'])
 
-        # Run age
-        print('No age models detected...')
-        print('-----------------------------------')
-        self.run_age()
-        print('-----------------------------------')
-        print('Resuming clinical outcomes...')
+        # Run age if not ages found
+        if self.df_ages is None:
+            print('No age data detected...')
+            print('-----------------------------------')
+            self.run_age()
+            print('-----------------------------------')
+            print('Resuming clinical outcomes...')
 
         # Obtain dataframes for each clinical group
         groups = self.df_clinical.columns.to_list()
-        group_features = []
         group_ages = []
         for g in groups:
-            group_features.append(self.df_features.loc[self.df_clinical[g]]) 
-            group_ages.append(self.df_age.loc[self.df_clinical[g]])
+            group_ages.append(self.df_ages.loc[self.df_clinical[g]])
 
         # Use visualizer to show age distribution
-        self.age_distribution(group_features, groups, name='clinical_groups')
+        self.age_distribution(group_ages, groups, name='clinical_groups')
 
         # Use visualizer to show box plots of deltas by group
         self.deltas_by_group(group_ages, groups)
 
-    @log
     def run_classification(self):
         """Run classification between two different clinical groups."""
 
@@ -411,23 +475,25 @@ def __init__(self):
         # Run modelling
         case = args.run
         if case == 'age':
-            self.run_age()
+            self.run = self.run_age
         elif case == 'lifestyle':
-            self.run_lifestyle()
+            self.run = self.run_lifestyle
         elif case == 'clinical':
-            self.run_clinical()
+            self.run = self.run_clinical
         elif case == 'classification':
-            self.run_classification()
+            self.run = self.run_classification
         else:
             raise ValueError('Choose a valid run type: age, lifestyle, clinical, classification')
 
+        self.run_wrapper(self.run)
+
     def configure_parser(self):
         """Configure parser with required arguments for processing."""
         self.parser.add_argument('-r', '--run', metavar='RUN', default='age', required=True,
                                  help=messages.run_long_description)
         self.parser.add_argument('-o', '--output', metavar='DIR', required=True,
                                  help=messages.output_long_description)
-        self.parser.add_argument('-f', "--features", metavar='FILE', required=True,
+        self.parser.add_argument('-f', "--features", metavar='FILE',
                                  help=messages.features_long_description)
         self.parser.add_argument('-m', '--model', nargs='*', default=['linear'],
                                  help=messages.model_long_description)
@@ -443,6 +509,8 @@ def configure_parser(self):
                                  help=messages.clinical_long_description)
         self.parser.add_argument("--systems", metavar='FILE',
                                  help=messages.systems_long_description)
+        self.parser.add_argument("--ages", metavar='FILE',
+                                 help=messages.ages_long_description)
 
     def configure_args(self, args):
         """Configure argumens with required fromatting for modelling.
@@ -566,16 +634,18 @@ def initial_command(self):
         print('Output directory path (Required):')
         self.force_command(self.output_command, 'o', required=True)
         # Ask for input files
-        print('Input features file path (Required):')
-        self.force_command(self.load_command, 'l --features', required=True)
+        print('Input features file path (Required for run age):')
+        self.force_command(self.load_command, 'l --features')
         print('Input covariates file path (Optional):')
         self.force_command(self.load_command, 'l --covariates')
-        print('Input factors file path (Optional):')
+        print('Input factors file path (Reqruired for run lifestyle):')
         self.force_command(self.load_command, 'l --factors')
-        print('Input clinical file path (Optional):')
+        print('Input clinical file path (Required for run clinical or run classification):')
         self.force_command(self.load_command, 'l --clinical')
         print('Input systems file path (Optional):')
         self.force_command(self.load_command, 'l --systems')
+        print('Input ages file path (Optional):')
+        self.force_command(self.load_command, 'l --ages')
 
         # Ask for scaler, model and CV parameters
         print('Scaler type and parameters (Default:standard):')
@@ -637,7 +707,7 @@ def command_interface(self):
             elif command == "r":
                 # Capture any error raised and print
                 try:
-                    self.run()
+                    self.run_wrapper(self.run)
                 except Exception as e:
                     print(e)
                     print('Error running modelling.')
@@ -731,7 +801,7 @@ def load_command(self):
             else:
                 if not self.check_file(file):
                     error = 'File %s not found.' % file
-                elif file_type in ['--features', '--covariates', '--factors', '--clinical']:
+                elif file_type in ['--features', '--covariates', '--factors', '--clinical', '--ages']:
                     if not file.endswith('.csv'):
                         error = 'File %s must be a .csv file.' % file
                 elif file_type == '--systems':
@@ -744,10 +814,7 @@ def load_command(self):
 
         # Set file path
         if file_type == '--features':
-            if file is None:
-                error = 'A features file must be provided must not be None.'
-            else:
-                self.args.features = file
+            self.args.features = file
         elif file_type == '--covariates':
             self.args.covariates = file
         elif file_type == '--factors':
@@ -756,6 +823,8 @@ def load_command(self):
             self.args.clinical = file
         elif file_type == '--systems':
             self.args.systems = file
+        elif file_type == '--ages':
+            self.args.ages = file
         else:
             error = 'Choose a valid file type: --features, --covariates, --factors, --clinical, --systems'