Merge branch 'main' into stable

brsynth · Nov 21, 2024 · 572d055
2 parents 01c0dc6 + e4dfbab
commit 572d055
Show file tree

Hide file tree

Showing 5 changed files with 96 additions and 48 deletions.
diff --git a/environment.yaml b/environment.yaml
@@ -7,3 +7,5 @@ dependencies:
   - numpy
   - pyDOE2
   - openpyxl
+  - statsmodels
+  - matplotlib
diff --git a/icfree/learner/__init__.py b/icfree/learner/__init__.py
@@ -1 +0,0 @@
-

diff --git a/icfree/learner/calibrator.py b/icfree/learner/calibrator.py
@@ -36,11 +36,11 @@ def fit_regression_with_outlier_removal(y: np.ndarray, y_ref: np.ndarray, r2_lim
     max_outliers = int(0.3 * len(y))  # 30% of data points can be considered outliers
     current_r2 = 0
     num_outliers_removed = 0
-
+
+    original_indices = np.arange(len(y))
     outlier_indices = []
 
     while current_r2 <= r2_limit and num_outliers_removed < max_outliers:
-        print(f"Current R²: {current_r2:.2f}, r2_limit: {r2_limit:.2f}, Outliers removed: {num_outliers_removed}")
         # Add a constant term for OLS
         X = sm.add_constant(y)
         model = sm.OLS(y_ref, X).fit()
@@ -52,11 +52,14 @@ def fit_regression_with_outlier_removal(y: np.ndarray, y_ref: np.ndarray, r2_lim
 
         # Identify the index of the maximum Cook's distance
         max_cooks_index = np.argmax(cooks_d)
-
-        # Add the index to outlier list and remove it from the data
-        outlier_indices.append(max_cooks_index)
+
+        # Record the original index of the outlier
+        outlier_indices.append(original_indices[max_cooks_index])
+
+        # Remove the outlier from the data
         y = np.delete(y, max_cooks_index)
         y_ref = np.delete(y_ref, max_cooks_index)
+        original_indices = np.delete(original_indices, max_cooks_index)
         num_outliers_removed += 1
 
     # Fit the final model
@@ -156,67 +159,53 @@ def save_data(data: pd.DataFrame, output_file: str):
 if __name__ == "__main__":
     # Set up argument parsing
     parser = argparse.ArgumentParser(description='Calculate yield based on fluorescence data and optionally apply calibration.')
-    parser.add_argument('--file', type=str, required=True, help='Path to the input file (.csv or .xlsx)')
+    parser.add_argument('file', type=str, help='Path to the input file (.csv or .xlsx)')
+    parser.add_argument('ref_file', type=str, help='Path to the reference input file (.csv or .xlsx)')
     parser.add_argument('--jove_plus', type=int, required=True, help='Line number for Jove+ (1-based index)')
     parser.add_argument('--jove_minus', type=int, required=True, help='Line number for Jove- (1-based index)')
     parser.add_argument('--r2_limit', type=float, default=0.8, help='R-squared limit for the regression (default: 0.8)')
-    parser.add_argument('--ref_file', type=str, help='Path to the reference input file (.csv or .xlsx)')
     parser.add_argument('--output', type=str, required=True, help='Output file name (.csv or .xlsx)')
     parser.add_argument('--plot', type=str, help='Output PNG file name for the plot of calibrated points')
-    parser.add_argument('--num_control_points', type=int, default=10, help='Number of control points to select (default: 5)')
+    parser.add_argument('--num_control_points', type=int, default=5, help='Number of control points to select (default: 5)')
 
     args = parser.parse_args()
 
     # Load the data from the input file
     input_data = load_data(args.file)
+    ref_data = load_data(args.ref_file)
 
-    # Calculate the yield and get the modified DataFrame
-    modified_data = calculate_yield(input_data, args.jove_plus, args.jove_minus)
+    # Calculate yields for the input file if they do not exist
+    if not any('Yield' in col for col in input_data.columns):
+        input_data = calculate_yield(input_data, args.jove_plus, args.jove_minus)
 
-    # Detect component columns
-    component_columns = detect_component_columns(modified_data)
+    # Check if the reference file contains "Yield" columns
+    if not any('Yield' in col for col in ref_data.columns):
+        print("Error: The reference file must contain 'Yield #' columns. Please ensure the reference file includes these columns and try again.")
+        exit(1)
 
-    # Check if a reference file is provided
-    if args.ref_file:
-        # Load the reference data
-        ref_data = load_data(args.ref_file)
+    # Detect component columns
+    component_columns = detect_component_columns(input_data)
 
-        # Find matching indices based on component combinations
-        matching_input_indices, matching_ref_indices = find_matching_indices(modified_data, ref_data, component_columns)
+    # Find matching indices based on component combinations
+    matching_input_indices, matching_ref_indices = find_matching_indices(input_data, ref_data, component_columns)
 
-        # Compute average yields for matching component combinations
-        avg_yield, avg_yield_ref = compute_average_yields(modified_data, ref_data, matching_input_indices, matching_ref_indices)
+    # Compute average yields for matching component combinations
+    avg_yield, avg_yield_ref = compute_average_yields(input_data, ref_data, matching_input_indices, matching_ref_indices)
 
-        # Fit the regression with outlier removal on average yields
-        a, b, r2_value, outlier_indices = fit_regression_with_outlier_removal(avg_yield, avg_yield_ref, args.r2_limit)
+    # Fit the regression with outlier removal on average yields
+    a, b, r2_value, outlier_indices = fit_regression_with_outlier_removal(avg_yield, avg_yield_ref, args.r2_limit)
 
-        # Display the regression coefficients and R² value in the terminal
-        print(f"Regression Line: y = {a:.2f}x + {b:.2f}")
-        print(f"R² Value: {r2_value:.2f}")
+    # Display the regression coefficients and R² value in the terminal
+    print(f"Regression Line: y = {a:.2f}x + {b:.2f}")
+    print(f"R² Value: {r2_value:.2f}")
 
-        # Add calibrated yield columns
-        calibrated_data = add_calibrated_yield(modified_data, a, b)
+    # Add calibrated yield columns
+    calibrated_data = add_calibrated_yield(input_data, a, b)
 
-        # Plot the calibrated points with outliers and regression line if requested
-        if args.plot:
-            plot_calibrated_points(avg_yield, avg_yield_ref, outlier_indices, a, b, r2_value, args.plot, args.file, args.ref_file)
-    else:
-        # If no reference file is provided, just use the original
-        calibrated_data = modified_data
+    # Plot the calibrated points with outliers and regression line if requested
+    if args.plot:
+        plot_calibrated_points(avg_yield, avg_yield_ref, outlier_indices, a, b, r2_value, args.plot, args.file, args.ref_file)
 
     # Save the modified DataFrame to the specified output file
     save_data(calibrated_data, args.output)
-    print(f"Calibrated yields saved in {args.output}")
-    if args.plot:
-        print(f"Plot saved as {args.plot}")
-
-    # Select control points
-    jove_plus_index = args.jove_plus - 2
-    jove_minus_index = args.jove_minus - 2
-    control_data = select_control_points(modified_data, jove_plus_index, jove_minus_index, args.num_control_points)
-
-    # Save the new control points
-    outf = os.path.splitext(args.output)[0] + '_control_points.csv'
-    save_data(control_data, outf)
-    print(f"New control points saved in {outf}")
-
+    print(f"Modified file saved as {args.output}")
diff --git a/tests/data/learner/calibrator/input/~$plate3.xlsx b/tests/data/learner/calibrator/input/~$plate3.xlsx
diff --git a/tests/learner/test_calibrator.py b/tests/learner/test_calibrator.py
@@ -0,0 +1,58 @@
+import unittest
+import pandas as pd
+import numpy as np
+from icfree.learner.calibrator import calculate_yield, add_calibrated_yield, fit_regression_with_outlier_removal
+
+class TestCalibrator(unittest.TestCase):
+    """Unit tests for calculate_yield, add_calibrated_yield and fit_regression_with_outlier_removal."""
+    def setUp(self):
+        """Create the fluorescence table and the regression inputs shared by the tests."""
+        self.data = pd.DataFrame({
+            'Fluorescence_1': [100, 200, 300, 400],
+            'Fluorescence_2': [150, 250, 350, 450]
+        })
+        self.jove_plus_line = 5
+        self.jove_minus_line = 2
+        self.a = 1.5
+        self.b = 0.5
+        self.y = np.array([1, 2, 3, 4, 5])
+        self.y_ref = np.array([1.2, 1.9, 3.1, 4.0, 5.1])
+        self.r2_limit = 0.95
+
+    def test_calculate_yield(self):
+        """calculate_yield must keep the fluorescence columns and add matching 'Yield_#' columns."""
+        result = calculate_yield(self.data, self.jove_plus_line, self.jove_minus_line)
+        expected_columns = ['Fluorescence_1', 'Fluorescence_2', 'Yield_1', 'Yield_2']
+        self.assertTrue(all(col in result.columns for col in expected_columns))
+
+        # Line numbers map to row positions as (line - 2); autofluorescence is the mean of the Jove- row
+        autofluorescence = np.mean([self.data[fluo][self.jove_minus_line-2] for fluo in self.data if 'Fluorescence' in fluo])
+        # Reference is the mean of the fluorescences on the Jove+ row
+        reference = np.mean([self.data[fluo][self.jove_plus_line-2] for fluo in self.data if 'Fluorescence' in fluo])
+        expected_yield_1 = (self.data['Fluorescence_1'] - autofluorescence) / (reference - autofluorescence)
+        pd.testing.assert_series_equal(result['Yield_1'], expected_yield_1, check_names=False)
+
+    def test_add_calibrated_yield(self):
+        """add_calibrated_yield must add 'Calibrated Yield_#' columns equal to a * yield + b."""
+        yield_data = calculate_yield(self.data, self.jove_plus_line, self.jove_minus_line)
+        result = add_calibrated_yield(yield_data, self.a, self.b)
+        expected_columns = ['Calibrated Yield_1', 'Calibrated Yield_2']
+        self.assertTrue(all(col in result.columns for col in expected_columns))
+
+        # The calibration is expected to be the linear map a * yield + b
+        expected_calibrated_yield_1 = self.a * result['Yield_1'] + self.b
+        pd.testing.assert_series_equal(result['Calibrated Yield_1'], expected_calibrated_yield_1, check_names=False)
+
+    def test_fit_regression_with_outlier_removal(self):
+        """Check the result types and that the final R² reaches the requested limit."""
+        a, b, r2_value, outliers = fit_regression_with_outlier_removal(self.y, self.y_ref, self.r2_limit)
+
+        self.assertIsInstance(a, float)
+        self.assertIsInstance(b, float)
+        self.assertGreaterEqual(r2_value, self.r2_limit)
+        self.assertIsInstance(outliers, list)
+        # np.integer, not np.int64: np.arange's default integer width is platform-dependent
+        self.assertTrue(all(isinstance(i, np.integer) for i in outliers))
+
+if __name__ == '__main__':
+    unittest.main()