feat(learner/extractor): infer num replicates from initial file
breakthewall committed Nov 28, 2024
1 parent 047a99e commit c983045
Showing 26 changed files with 2,101 additions and 428 deletions.
171 changes: 86 additions & 85 deletions icfree/learner/extractor.py
@@ -1,29 +1,77 @@
import pandas as pd
import argparse

- def find_n_m(sampling_file_path):
+ def find_n_m_from_sampling(sampling_file_path):
"""
- Find the number of unique combinations (n) and the number of replicates (m) from a sampling file.
+ Find the number of unique combinations (n) and determine if there are repetitions in the sampling file.
Parameters:
sampling_file_path (str): Path to the sampling file (Excel, CSV, or TSV).
Returns:
- tuple: A tuple containing the number of unique combinations (n) and the number of replicates (m).
+ tuple: Number of unique combinations (n) and a boolean indicating if repetitions exist.
"""
+ file_extension = sampling_file_path.split('.')[-1].lower()

# Load the sampling file
- df_sampling = pd.read_csv(sampling_file_path, sep='\t')
+ if file_extension == 'xlsx':
+ df_sampling = pd.read_excel(sampling_file_path)
+ elif file_extension == 'csv':
+ df_sampling = pd.read_csv(sampling_file_path)
+ elif file_extension == 'tsv':
+ df_sampling = pd.read_csv(sampling_file_path, sep='\t')
+ else:
+ raise ValueError("Unsupported file type. Please provide an Excel (.xlsx), CSV (.csv), or TSV (.tsv) file.")

+ # Drop the unnamed column if it exists
+ if df_sampling.columns[0].startswith('Unnamed'):
+ df_sampling = df_sampling.drop(columns=df_sampling.columns[0])

# Identify unique combinations (n) and replicates (m)
unique_combinations = df_sampling.drop_duplicates()
n = unique_combinations.shape[0]
- m = df_sampling.shape[0] // n
+ has_repetitions = df_sampling.shape[0] > n

+ return n, has_repetitions

+ def infer_replicates(initial_data_file, sampling_file_path, num_samples):
+ """
+ Infer the number of replicates from the initial data file and sampling file.
+ Parameters:
+ initial_data_file (str): Path to the initial data file (Excel, CSV, or TSV).
+ sampling_file_path (str): Path to the sampling file (Excel, CSV, or TSV).
+ num_samples (int): Number of unique combinations (n).
+ Returns:
+ int: Inferred number of replicates.
+ """
+ # Load initial data file
+ file_extension = initial_data_file.split('.')[-1].lower()
+ if file_extension == 'xlsx':
+ df_initial = pd.read_excel(initial_data_file)
+ elif file_extension == 'csv':
+ df_initial = pd.read_csv(initial_data_file)
+ elif file_extension == 'tsv':
+ df_initial = pd.read_csv(initial_data_file, sep='\t')
+ else:
+ raise ValueError("Unsupported file type. Please provide an Excel (.xlsx), CSV (.csv), or TSV (.tsv) file.")

+ # Remove the first two columns
+ df_initial = df_initial.iloc[:, 2:]

+ # Infer replicates based on the number of columns
+ total_columns = df_initial.shape[1]

+ # Load sampling file to check for repetitions
+ _, has_repetitions = find_n_m_from_sampling(sampling_file_path)

- return n, m
+ if has_repetitions:
+ num_replicates = total_columns // num_samples
+ else:
+ # Assume control wells if extra columns exist
+ num_replicates = total_columns // num_samples

+ return num_replicates

def process_data(file_path, num_samples, num_replicates):
"""
@@ -37,10 +85,7 @@ def process_data(file_path, num_samples, num_replicates):
Returns:
tuple: A tuple containing the reshaped DataFrame and the sheet name (if applicable).
"""
- # Determine the file extension
file_extension = file_path.split('.')[-1].lower()

- # Load the data based on file extension
if file_extension == 'xlsx':
excel_data = pd.ExcelFile(file_path)
sheet_name = excel_data.sheet_names[0]
@@ -54,73 +99,21 @@ def process_data(file_path, num_samples, num_replicates):
else:
raise ValueError("Unsupported file type. Please provide an Excel (.xlsx), CSV (.csv), or TSV (.tsv) file.")

- # Keep only the last row and remove the first two columns
- df_last_row = df.iloc[[-1]].drop(columns=df.columns[:2])
+ # Remove the first two columns
+ df = df.iloc[:, 2:]

- # Keep only the required number of values
+ # Reshape data based on num_samples and num_replicates
total_values = num_samples * num_replicates
- values_to_keep = df_last_row.values.flatten()[:total_values]

- # Reshape the values into the specified number of columns and rows
+ values_to_keep = df.values.flatten()[:total_values]
reshaped_values = values_to_keep.reshape((num_samples, num_replicates), order='F')

- # Create a new DataFrame with the reshaped values
+ # Create reshaped DataFrame
df_reshaped = pd.DataFrame(reshaped_values)

- # Add headers to the reshaped DataFrame
- headers = [f"Fluorescence Value {i+1}" for i in range(num_replicates)]
- df_reshaped.columns = headers

- # Add "Fluorescence Average" column
+ df_reshaped.columns = [f"Fluorescence Value {i+1}" for i in range(num_replicates)]
df_reshaped["Fluorescence Average"] = df_reshaped.mean(axis=1)

return df_reshaped, sheet_name

- def load_sampling_file(file_path, num_samples):
- """
- Load the sampling file and take only the first num_samples lines.
- Parameters:
- file_path (str): Path to the sampling file (Excel, CSV, or TSV).
- num_samples (int): Number of samples to load.
- Returns:
- DataFrame: A DataFrame containing the first num_samples lines of the sampling file.
- """
- # Determine the file extension
- file_extension = file_path.split('.')[-1].lower()

- # Load the data based on file extension
- if file_extension == 'xlsx':
- df = pd.read_excel(file_path)
- elif file_extension == 'csv':
- df = pd.read_csv(file_path)
- elif file_extension == 'tsv':
- df = pd.read_csv(file_path, sep='\t')
- else:
- raise ValueError("Unsupported file type. Please provide an Excel (.xlsx), CSV (.csv), or TSV (.tsv) file.")

- # Take only the first num_samples lines
- df = df.head(num_samples)

- return df

- def clean_sampling_file(df_sampling):
- """
- Clean the sampling file by removing the unnamed first column if it contains incrementing integers.
- Parameters:
- df_sampling (DataFrame): The sampling DataFrame to clean.
- Returns:
- DataFrame: The cleaned sampling DataFrame.
- """
- first_column = df_sampling.iloc[:, 0]
- if first_column.name.startswith('Unnamed') and pd.api.types.is_integer_dtype(first_column):
- if (first_column == range(len(first_column))).all():
- df_sampling = df_sampling.drop(columns=first_column.name)
- return df_sampling

def process(initial_data_file, output_file_path, sampling_file, num_samples=None, num_replicates=None, display=True):
"""
Process the initial data file and sampling file to combine the data and save the output.
@@ -129,30 +122,39 @@ def process(initial_data_file, output_file_path, sampling_file, num_samples=None
initial_data_file (str): Path to the initial data file (Excel, CSV, or TSV).
output_file_path (str): Path for the output file.
sampling_file (str): Path to the sampling file (Excel, CSV, or TSV).
- num_samples (int, optional): Number of samples. If not specified, it will be detected from the sampling file.
- num_replicates (int, optional): Number of replicates. If not specified, it will be detected from the sampling file.
+ num_samples (int, optional): Number of samples. If not specified, inferred from sampling file.
+ num_replicates (int, optional): Number of replicates. If not specified, inferred from initial data file.
display (bool, optional): Whether to display the combined data. Default is True.
Returns:
DataFrame: The combined DataFrame.
"""
if num_samples is None or num_replicates is None:
- n, m = find_n_m(sampling_file)
+ n, _ = find_n_m_from_sampling(sampling_file)
num_samples = num_samples if num_samples is not None else n
- num_replicates = num_replicates if num_replicates is not None else m
+ num_replicates = num_replicates if num_replicates is not None else infer_replicates(initial_data_file, sampling_file, num_samples)

df_reshaped, sheet_name = process_data(initial_data_file, num_samples, num_replicates)

- df_sampling = load_sampling_file(sampling_file, num_samples)
- df_sampling_cleaned = clean_sampling_file(df_sampling)
- df_combined = pd.concat([df_sampling_cleaned, df_reshaped], axis=1)
+ # Load sampling file
+ file_extension = sampling_file.split('.')[-1].lower()
+ if file_extension == 'xlsx':
+ df_sampling = pd.read_excel(sampling_file)
+ elif file_extension == 'csv':
+ df_sampling = pd.read_csv(sampling_file)
+ elif file_extension == 'tsv':
+ df_sampling = pd.read_csv(sampling_file, sep='\t')
+ else:
+ raise ValueError("Unsupported file type. Please provide an Excel (.xlsx), CSV (.csv), or TSV (.tsv) file.")

+ df_sampling = df_sampling.head(num_samples)
+ df_combined = pd.concat([df_sampling, df_reshaped], axis=1)

if display:
print(df_combined)

# Save the combined dataframe with headers and averages in the specified format
if output_file_path.endswith('.xlsx'):
- df_combined.to_excel(output_file_path, index=False, sheet_name=sheet_name)
+ df_combined.to_excel(output_file_path, index=False, sheet_name=sheet_name or "Sheet1")
elif output_file_path.endswith('.csv'):
df_combined.to_csv(output_file_path, index=False)
elif output_file_path.endswith('.tsv'):
@@ -164,14 +166,13 @@ def process(initial_data_file, output_file_path, sampling_file, num_samples=None
return df_combined

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process and reshape fluorescence data.")
parser.add_argument("--initial_data_file", type=str, required=True, help="Path to the initial Excel, CSV, or TSV file.")
parser = argparse.ArgumentParser(description="Process fluorescence data.")
parser.add_argument("--initial_data_file", type=str, required=True, help="Path to the initial data file.")
parser.add_argument("--output_file", type=str, required=True, help="Path for the output file.")
parser.add_argument("--sampling_file", type=str, required=True, help="Path to the sampling Excel, CSV, or TSV file.")
parser.add_argument("--num_samples", type=int, help="Number of samples.")
parser.add_argument("--num_replicates", type=int, help="Number of replicates.")
parser.add_argument("--no_display", action="store_true", help="Do not display the result.")

parser.add_argument("--sampling_file", type=str, required=True, help="Path to the sampling file.")
parser.add_argument("--num_samples", type=int, help="Number of samples (overrides detection).")
parser.add_argument("--num_replicates", type=int, help="Number of replicates (overrides detection).")
parser.add_argument("--no_display", action="store_true", help="Suppress displaying the combined data.")
args = parser.parse_args()

process(args.initial_data_file, args.output_file, args.sampling_file, args.num_samples, args.num_replicates, not args.no_display)
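
As a rough illustration of the new behaviour, the sketch below mirrors the logic of find_n_m_from_sampling and infer_replicates on made-up data. The column names, the values, and the assumption that the first two columns of the plate-reader file are metadata are illustrative only, not part of the commit.

import pandas as pd

# Hypothetical sampling file: 3 unique component combinations, each sampled twice (6 rows).
sampling = pd.DataFrame({
    "Mg_mM": [1, 1, 2, 2, 3, 3],
    "K_mM":  [10, 10, 20, 20, 30, 30],
})
n = sampling.drop_duplicates().shape[0]       # 3 unique combinations
has_repetitions = sampling.shape[0] > n       # True: 6 rows > 3 unique rows

# Hypothetical plate-reader file: 2 metadata columns followed by 6 fluorescence readings.
initial = pd.DataFrame([["plate1", "A1", 100, 110, 200, 210, 300, 310]])
total_columns = initial.iloc[:, 2:].shape[1]  # 6 value columns after dropping metadata

num_replicates = total_columns // n           # 6 // 3 = 2, the inferred replicate count
print(n, has_repetitions, num_replicates)     # 3 True 2

With this change, --num_replicates can be omitted on the command line and the script infers it from the initial data file, for example (file names are placeholders):

python icfree/learner/extractor.py --initial_data_file plate_reader.xlsx --sampling_file sampling.csv --output_file combined.csv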