diff --git a/environment.yaml b/environment.yaml index 618a287..389e6eb 100644 --- a/environment.yaml +++ b/environment.yaml @@ -7,3 +7,5 @@ dependencies: - numpy - pyDOE2 - openpyxl + - statsmodels + - matplotlib diff --git a/icfree/learner/__init__.py b/icfree/learner/__init__.py index 8b13789..e69de29 100644 --- a/icfree/learner/__init__.py +++ b/icfree/learner/__init__.py @@ -1 +0,0 @@ - diff --git a/icfree/learner/calibrator.py b/icfree/learner/calibrator.py index 398cbf4..55ff20c 100644 --- a/icfree/learner/calibrator.py +++ b/icfree/learner/calibrator.py @@ -36,11 +36,11 @@ def fit_regression_with_outlier_removal(y: np.ndarray, y_ref: np.ndarray, r2_lim max_outliers = int(0.3 * len(y)) # 30% of data points can be considered outliers current_r2 = 0 num_outliers_removed = 0 - + + original_indices = np.arange(len(y)) outlier_indices = [] while current_r2 <= r2_limit and num_outliers_removed < max_outliers: - print(f"Current R²: {current_r2:.2f}, r2_limit: {r2_limit:.2f}, Outliers removed: {num_outliers_removed}") # Add a constant term for OLS X = sm.add_constant(y) model = sm.OLS(y_ref, X).fit() @@ -52,11 +52,14 @@ def fit_regression_with_outlier_removal(y: np.ndarray, y_ref: np.ndarray, r2_lim # Identify the index of the maximum Cook's distance max_cooks_index = np.argmax(cooks_d) - - # Add the index to outlier list and remove it from the data - outlier_indices.append(max_cooks_index) + + # Record the original index of the outlier + outlier_indices.append(original_indices[max_cooks_index]) + + # Remove the outlier from the data y = np.delete(y, max_cooks_index) y_ref = np.delete(y_ref, max_cooks_index) + original_indices = np.delete(original_indices, max_cooks_index) num_outliers_removed += 1 # Fit the final model @@ -156,67 +159,53 @@ def save_data(data: pd.DataFrame, output_file: str): if __name__ == "__main__": # Set up argument parsing parser = argparse.ArgumentParser(description='Calculate yield based on fluorescence data and optionally apply calibration.') - parser.add_argument('--file', type=str, required=True, help='Path to the input file (.csv or .xlsx)') + parser.add_argument('file', type=str, help='Path to the input file (.csv or .xlsx)') + parser.add_argument('ref_file', type=str, help='Path to the reference input file (.csv or .xlsx)') parser.add_argument('--jove_plus', type=int, required=True, help='Line number for Jove+ (1-based index)') parser.add_argument('--jove_minus', type=int, required=True, help='Line number for Jove- (1-based index)') parser.add_argument('--r2_limit', type=float, default=0.8, help='R-squared limit for the regression (default: 0.8)') - parser.add_argument('--ref_file', type=str, help='Path to the reference input file (.csv or .xlsx)') parser.add_argument('--output', type=str, required=True, help='Output file name (.csv or .xlsx)') parser.add_argument('--plot', type=str, help='Output PNG file name for the plot of calibrated points') - parser.add_argument('--num_control_points', type=int, default=10, help='Number of control points to select (default: 5)') + parser.add_argument('--num_control_points', type=int, default=5, help='Number of control points to select (default: 5)') args = parser.parse_args() # Load the data from the input file input_data = load_data(args.file) + ref_data = load_data(args.ref_file) - # Calculate the yield and get the modified DataFrame - modified_data = calculate_yield(input_data, args.jove_plus, args.jove_minus) + # Calculate yields for the input file if they do not exist + if not any('Yield' in col for col in input_data.columns): + input_data = calculate_yield(input_data, args.jove_plus, args.jove_minus) - # Detect component columns - component_columns = detect_component_columns(modified_data) + # Check if the reference file contains "Yield" columns + if not any('Yield' in col for col in ref_data.columns): + print("Error: The reference file must contain 'Yield #' columns. Please ensure the reference file includes these columns and try again.") + exit(1) - # Check if a reference file is provided - if args.ref_file: - # Load the reference data - ref_data = load_data(args.ref_file) + # Detect component columns + component_columns = detect_component_columns(input_data) - # Find matching indices based on component combinations - matching_input_indices, matching_ref_indices = find_matching_indices(modified_data, ref_data, component_columns) + # Find matching indices based on component combinations + matching_input_indices, matching_ref_indices = find_matching_indices(input_data, ref_data, component_columns) - # Compute average yields for matching component combinations - avg_yield, avg_yield_ref = compute_average_yields(modified_data, ref_data, matching_input_indices, matching_ref_indices) + # Compute average yields for matching component combinations + avg_yield, avg_yield_ref = compute_average_yields(input_data, ref_data, matching_input_indices, matching_ref_indices) - # Fit the regression with outlier removal on average yields - a, b, r2_value, outlier_indices = fit_regression_with_outlier_removal(avg_yield, avg_yield_ref, args.r2_limit) + # Fit the regression with outlier removal on average yields + a, b, r2_value, outlier_indices = fit_regression_with_outlier_removal(avg_yield, avg_yield_ref, args.r2_limit) - # Display the regression coefficients and R² value in the terminal - print(f"Regression Line: y = {a:.2f}x + {b:.2f}") - print(f"R² Value: {r2_value:.2f}") + # Display the regression coefficients and R² value in the terminal + print(f"Regression Line: y = {a:.2f}x + {b:.2f}") + print(f"R² Value: {r2_value:.2f}") - # Add calibrated yield columns - calibrated_data = add_calibrated_yield(modified_data, a, b) + # Add calibrated yield columns + calibrated_data = add_calibrated_yield(input_data, a, b) - # Plot the calibrated points with outliers and regression line if requested - if args.plot: - plot_calibrated_points(avg_yield, avg_yield_ref, outlier_indices, a, b, r2_value, args.plot, args.file, args.ref_file) - else: - # If no reference file is provided, just use the original - calibrated_data = modified_data + # Plot the calibrated points with outliers and regression line if requested + if args.plot: + plot_calibrated_points(avg_yield, avg_yield_ref, outlier_indices, a, b, r2_value, args.plot, args.file, args.ref_file) # Save the modified DataFrame to the specified output file save_data(calibrated_data, args.output) - print(f"Calibrated yields saved in {args.output}") - if args.plot: - print(f"Plot saved as {args.plot}") - - # Select control points - jove_plus_index = args.jove_plus - 2 - jove_minus_index = args.jove_minus - 2 - control_data = select_control_points(modified_data, jove_plus_index, jove_minus_index, args.num_control_points) - - # Save the new control points - outf = os.path.splitext(args.output)[0] + '_control_points.csv' - save_data(control_data, outf) - print(f"New control points saved in {outf}") - + print(f"Modified file saved as {args.output}") diff --git a/tests/data/learner/calibrator/input/~$plate3.xlsx b/tests/data/learner/calibrator/input/~$plate3.xlsx deleted file mode 100644 index 5a93205..0000000 Binary files a/tests/data/learner/calibrator/input/~$plate3.xlsx and /dev/null differ diff --git a/tests/learner/test_calibrator.py b/tests/learner/test_calibrator.py new file mode 100644 index 0000000..7830b06 --- /dev/null +++ b/tests/learner/test_calibrator.py @@ -0,0 +1,58 @@ +import unittest +import pandas as pd +import numpy as np +from icfree.learner.calibrator import calculate_yield, add_calibrated_yield, fit_regression_with_outlier_removal + +class TestCalibrator(unittest.TestCase): + def setUp(self): + # Create sample data for testing + self.data = pd.DataFrame({ + 'Fluorescence_1': [100, 200, 300, 400], + 'Fluorescence_2': [150, 250, 350, 450] + }) + self.jove_plus_line = 5 + self.jove_minus_line = 2 + self.a = 1.5 + self.b = 0.5 + self.y = np.array([1, 2, 3, 4, 5]) + self.y_ref = np.array([1.2, 1.9, 3.1, 4.0, 5.1]) + self.r2_limit = 0.95 + + def test_calculate_yield(self): + # Test the calculate_yield function + result = calculate_yield(self.data, self.jove_plus_line, self.jove_minus_line) + expected_columns = ['Fluorescence_1', 'Fluorescence_2', 'Yield_1', 'Yield_2'] + self.assertTrue(all([col in result.columns for col in expected_columns])) + + # Check if yields are calculated correctly + # Autofluorescence is the mean of fluorescences for jove_minus_line + autofluorescence = np.mean([self.data[fluo][self.jove_minus_line-2] for fluo in self.data if 'Fluorescence' in fluo]) + # Reference is the mean of fluorescences for jove_plus_line + reference = np.mean([self.data[fluo][self.jove_plus_line-2] for fluo in self.data if 'Fluorescence' in fluo]) + expected_yield_1 = (self.data['Fluorescence_1'] - autofluorescence) / (reference - autofluorescence) + pd.testing.assert_series_equal(result['Yield_1'], expected_yield_1, check_names=False) + + def test_add_calibrated_yield(self): + # Test the add_calibrated_yield function + yield_data = calculate_yield(self.data, self.jove_plus_line, self.jove_minus_line) + result = add_calibrated_yield(yield_data, self.a, self.b) + expected_columns = ['Calibrated Yield_1', 'Calibrated Yield_2'] + self.assertTrue(all([col in result.columns for col in expected_columns])) + + # Check if calibrated yields are calculated correctly + expected_calibrated_yield_1 = self.a * result['Yield_1'] + self.b + pd.testing.assert_series_equal(result['Calibrated Yield_1'], expected_calibrated_yield_1, check_names=False) + + def test_fit_regression_with_outlier_removal(self): + # Test the fit_regression_with_outlier_removal function + a, b, r2_value, outliers = fit_regression_with_outlier_removal(self.y, self.y_ref, self.r2_limit) + + # Check if the regression coefficients and R2 value are within expected limits + self.assertIsInstance(a, float) + self.assertIsInstance(b, float) + self.assertGreaterEqual(r2_value, self.r2_limit) + self.assertIsInstance(outliers, list) + self.assertTrue(all(isinstance(i, np.int64) for i in outliers)) + +if __name__ == '__main__': + unittest.main()