Skip to content

Commit

Permalink
Merge branch 'main' into stable
Browse files Browse the repository at this point in the history
  • Loading branch information
breakthewall committed Nov 21, 2024
2 parents 01c0dc6 + e4dfbab commit 572d055
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 48 deletions.
2 changes: 2 additions & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ dependencies:
- numpy
- pyDOE2
- openpyxl
- statsmodels
- matplotlib
1 change: 0 additions & 1 deletion icfree/learner/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@

83 changes: 36 additions & 47 deletions icfree/learner/calibrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ def fit_regression_with_outlier_removal(y: np.ndarray, y_ref: np.ndarray, r2_lim
max_outliers = int(0.3 * len(y)) # 30% of data points can be considered outliers
current_r2 = 0
num_outliers_removed = 0


original_indices = np.arange(len(y))
outlier_indices = []

while current_r2 <= r2_limit and num_outliers_removed < max_outliers:
print(f"Current R²: {current_r2:.2f}, r2_limit: {r2_limit:.2f}, Outliers removed: {num_outliers_removed}")
# Add a constant term for OLS
X = sm.add_constant(y)
model = sm.OLS(y_ref, X).fit()
Expand All @@ -52,11 +52,14 @@ def fit_regression_with_outlier_removal(y: np.ndarray, y_ref: np.ndarray, r2_lim

# Identify the index of the maximum Cook's distance
max_cooks_index = np.argmax(cooks_d)

# Add the index to outlier list and remove it from the data
outlier_indices.append(max_cooks_index)

# Record the original index of the outlier
outlier_indices.append(original_indices[max_cooks_index])

# Remove the outlier from the data
y = np.delete(y, max_cooks_index)
y_ref = np.delete(y_ref, max_cooks_index)
original_indices = np.delete(original_indices, max_cooks_index)
num_outliers_removed += 1

# Fit the final model
Expand Down Expand Up @@ -156,67 +159,53 @@ def save_data(data: pd.DataFrame, output_file: str):
if __name__ == "__main__":
# Set up argument parsing
parser = argparse.ArgumentParser(description='Calculate yield based on fluorescence data and optionally apply calibration.')
parser.add_argument('--file', type=str, required=True, help='Path to the input file (.csv or .xlsx)')
parser.add_argument('file', type=str, help='Path to the input file (.csv or .xlsx)')
parser.add_argument('ref_file', type=str, help='Path to the reference input file (.csv or .xlsx)')
parser.add_argument('--jove_plus', type=int, required=True, help='Line number for Jove+ (1-based index)')
parser.add_argument('--jove_minus', type=int, required=True, help='Line number for Jove- (1-based index)')
parser.add_argument('--r2_limit', type=float, default=0.8, help='R-squared limit for the regression (default: 0.8)')
parser.add_argument('--ref_file', type=str, help='Path to the reference input file (.csv or .xlsx)')
parser.add_argument('--output', type=str, required=True, help='Output file name (.csv or .xlsx)')
parser.add_argument('--plot', type=str, help='Output PNG file name for the plot of calibrated points')
parser.add_argument('--num_control_points', type=int, default=10, help='Number of control points to select (default: 5)')
parser.add_argument('--num_control_points', type=int, default=5, help='Number of control points to select (default: 5)')

args = parser.parse_args()

# Load the data from the input file
input_data = load_data(args.file)
ref_data = load_data(args.ref_file)

# Calculate the yield and get the modified DataFrame
modified_data = calculate_yield(input_data, args.jove_plus, args.jove_minus)
# Calculate yields for the input file if they do not exist
if not any('Yield' in col for col in input_data.columns):
input_data = calculate_yield(input_data, args.jove_plus, args.jove_minus)

# Detect component columns
component_columns = detect_component_columns(modified_data)
# Check if the reference file contains "Yield" columns
if not any('Yield' in col for col in ref_data.columns):
print("Error: The reference file must contain 'Yield #' columns. Please ensure the reference file includes these columns and try again.")
exit(1)

# Check if a reference file is provided
if args.ref_file:
# Load the reference data
ref_data = load_data(args.ref_file)
# Detect component columns
component_columns = detect_component_columns(input_data)

# Find matching indices based on component combinations
matching_input_indices, matching_ref_indices = find_matching_indices(modified_data, ref_data, component_columns)
# Find matching indices based on component combinations
matching_input_indices, matching_ref_indices = find_matching_indices(input_data, ref_data, component_columns)

# Compute average yields for matching component combinations
avg_yield, avg_yield_ref = compute_average_yields(modified_data, ref_data, matching_input_indices, matching_ref_indices)
# Compute average yields for matching component combinations
avg_yield, avg_yield_ref = compute_average_yields(input_data, ref_data, matching_input_indices, matching_ref_indices)

# Fit the regression with outlier removal on average yields
a, b, r2_value, outlier_indices = fit_regression_with_outlier_removal(avg_yield, avg_yield_ref, args.r2_limit)
# Fit the regression with outlier removal on average yields
a, b, r2_value, outlier_indices = fit_regression_with_outlier_removal(avg_yield, avg_yield_ref, args.r2_limit)

# Display the regression coefficients and R² value in the terminal
print(f"Regression Line: y = {a:.2f}x + {b:.2f}")
print(f"R² Value: {r2_value:.2f}")
# Display the regression coefficients and R² value in the terminal
print(f"Regression Line: y = {a:.2f}x + {b:.2f}")
print(f"R² Value: {r2_value:.2f}")

# Add calibrated yield columns
calibrated_data = add_calibrated_yield(modified_data, a, b)
# Add calibrated yield columns
calibrated_data = add_calibrated_yield(input_data, a, b)

# Plot the calibrated points with outliers and regression line if requested
if args.plot:
plot_calibrated_points(avg_yield, avg_yield_ref, outlier_indices, a, b, r2_value, args.plot, args.file, args.ref_file)
else:
# If no reference file is provided, just use the original
calibrated_data = modified_data
# Plot the calibrated points with outliers and regression line if requested
if args.plot:
plot_calibrated_points(avg_yield, avg_yield_ref, outlier_indices, a, b, r2_value, args.plot, args.file, args.ref_file)

# Save the modified DataFrame to the specified output file
save_data(calibrated_data, args.output)
print(f"Calibrated yields saved in {args.output}")
if args.plot:
print(f"Plot saved as {args.plot}")

# Select control points
jove_plus_index = args.jove_plus - 2
jove_minus_index = args.jove_minus - 2
control_data = select_control_points(modified_data, jove_plus_index, jove_minus_index, args.num_control_points)

# Save the new control points
outf = os.path.splitext(args.output)[0] + '_control_points.csv'
save_data(control_data, outf)
print(f"New control points saved in {outf}")

print(f"Modified file saved as {args.output}")
Binary file removed tests/data/learner/calibrator/input/~$plate3.xlsx
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/learner/test_calibrator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import unittest
import pandas as pd
import numpy as np
from icfree.learner.calibrator import calculate_yield, add_calibrated_yield, fit_regression_with_outlier_removal

class TestCalibrator(unittest.TestCase):
    """Unit tests for the yield and calibration helpers of ``icfree.learner.calibrator``."""

    def setUp(self):
        """Create the small fixtures shared by every test."""
        # Two fluorescence replicates over four rows.
        self.data = pd.DataFrame({
            'Fluorescence_1': [100, 200, 300, 400],
            'Fluorescence_2': [150, 250, 350, 450],
        })
        # Line numbers of the control rows; the checks below map them to
        # row positions with ``line - 2`` (5 -> row 3, 2 -> row 0).
        self.jove_plus_line = 5
        self.jove_minus_line = 2
        # Slope and intercept used for the calibrated-yield check.
        self.a = 1.5
        self.b = 0.5
        # Nearly linear data for the regression test.
        self.y = np.array([1, 2, 3, 4, 5])
        self.y_ref = np.array([1.2, 1.9, 3.1, 4.0, 5.1])
        self.r2_limit = 0.95

    def test_calculate_yield(self):
        """calculate_yield keeps the inputs and adds one yield column per replicate."""
        result = calculate_yield(self.data, self.jove_plus_line, self.jove_minus_line)

        # assertIn names the missing column on failure, unlike assertTrue(all(...)).
        for col in ('Fluorescence_1', 'Fluorescence_2', 'Yield_1', 'Yield_2'):
            with self.subTest(column=col):
                self.assertIn(col, result.columns)

        fluo_columns = [col for col in self.data if 'Fluorescence' in col]
        # Autofluorescence is the mean of fluorescences for jove_minus_line
        autofluorescence = np.mean(
            [self.data[col].iloc[self.jove_minus_line - 2] for col in fluo_columns]
        )
        # Reference is the mean of fluorescences for jove_plus_line
        reference = np.mean(
            [self.data[col].iloc[self.jove_plus_line - 2] for col in fluo_columns]
        )
        expected_yield_1 = (
            (self.data['Fluorescence_1'] - autofluorescence)
            / (reference - autofluorescence)
        )
        pd.testing.assert_series_equal(result['Yield_1'], expected_yield_1, check_names=False)

    def test_add_calibrated_yield(self):
        """add_calibrated_yield applies ``a * yield + b`` to each yield column."""
        yield_data = calculate_yield(self.data, self.jove_plus_line, self.jove_minus_line)
        result = add_calibrated_yield(yield_data, self.a, self.b)

        for col in ('Calibrated Yield_1', 'Calibrated Yield_2'):
            with self.subTest(column=col):
                self.assertIn(col, result.columns)

        # Check if calibrated yields are calculated correctly
        expected_calibrated_yield_1 = self.a * result['Yield_1'] + self.b
        pd.testing.assert_series_equal(
            result['Calibrated Yield_1'], expected_calibrated_yield_1, check_names=False
        )

    def test_fit_regression_with_outlier_removal(self):
        """The fit returns float coefficients, a sufficient R2 and valid outlier indices."""
        a, b, r2_value, outliers = fit_regression_with_outlier_removal(
            self.y, self.y_ref, self.r2_limit
        )

        # Check if the regression coefficients and R2 value are within expected limits
        self.assertIsInstance(a, float)
        self.assertIsInstance(b, float)
        self.assertGreaterEqual(r2_value, self.r2_limit)
        self.assertIsInstance(outliers, list)
        # np.integer rather than np.int64: the default integer width of
        # np.arange is platform-dependent (int32 on some platforms).
        self.assertTrue(all(isinstance(i, np.integer) for i in outliers))
        # Outliers are reported as positions in the original input array.
        self.assertTrue(all(0 <= int(i) < len(self.y) for i in outliers))

# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 572d055

Please sign in to comment.